使用python写的一个过google 图片搜索爬虫解决思路
使用python写的一个过google 图片搜索爬虫
本人python小白,没学过python。。。
这里有一段python的源码
想求问这里实现搜索的那个最终url是什么
大概是类似这种:
http://www.google.com/search?q=XXX&..........
生成 url 的代码已在下面用粗体字标出,大概就在那里。请问最终请求的 url 是什么?感谢!
import requests
import re
import math
IMAGES_PER_PAGE = 100  # Google image search serves up to 100 thumbnails per result page (the 'ijn' page index assumes this)
search_url = 'http://www.google.com/search'  # base endpoint; the params below switch it into image-search mode
# Pretend to be Firefox v25.0
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:25.0) Gecko/20100101 Firefox/25.0'
}
# Image search for medium sized photos
params = {
'tbm': 'isch',  # tbm=isch selects Google *image* search
'tbs': 'isz:m,itp:photo'  # filter: medium size (isz:m), photo type (itp:photo)
}
def search_sync(query, npages, timeout=1.0):
    """Fetch result pages one at a time and collect their image URLs.

    A page that fails with a network error is skipped (best effort)
    rather than aborting the whole search.

    :param query: search terms, sent as the ``q`` parameter
    :param npages: number of result pages to request (ijn = 0..npages-1)
    :param timeout: per-request timeout in seconds
    :returns: list of image URL strings, in page order
    """
    results = []
    # range() instead of Py2-only xrange(): behaves identically here
    # (small page count) and is Python 3 compatible.
    for page in range(npages):
        try:
            results.extend(get_page_urls(query, page, timeout))
        except requests.exceptions.RequestException:
            # Best-effort: one failed page should not kill the search.
            continue
    return results
def search_async(query, npages, timeout=1.0):
    """Fetch all result pages concurrently using gevent greenlets.

    :param query: search terms, sent as the ``q`` parameter
    :param npages: number of result pages to request (ijn = 0..npages-1)
    :param timeout: per-request timeout in seconds
    :returns: list of image URL strings from every page that completed
    """
    import gevent  # local import: the sync path has no gevent dependency

    def fetch_page(page):
        # Swallow per-page network errors, mirroring search_sync.
        try:
            return get_page_urls(query, page, timeout)
        except requests.exceptions.RequestException:
            return []

    # range() instead of Py2-only xrange(): identical behavior, Py3-compatible.
    jobs = [gevent.spawn(fetch_page, page) for page in range(npages)]
    # NOTE(review): joinall reuses the per-request timeout as the TOTAL wait
    # for all pages, so still-running jobs (value is None) are silently
    # dropped below — confirm whether a larger join budget is intended.
    gevent.joinall(jobs, timeout=timeout)
    results = []
    for job in jobs:
        if job.value:
            results.extend(job.value)
    return results
def search(query, max_results=100, timeout=1.0, async=True):
npages = int(math.ceil(max_results / float(IMAGES_PER_PAGE)))
search_func = search_async if async else search_sync
return search_func(query, npages, timeout)[:max_results]
def get_page_urls(query, page, timeout=None):
    """Fetch one Google image-search result page and extract its image URLs.

    :param query: search terms (``q`` parameter)
    :param page: zero-based result-page index (``ijn`` parameter)
    :param timeout: requests timeout in seconds, or None for no timeout
    :raises requests.exceptions.RequestException: on network/HTTP failure
    """
    request_params = {'q': query, 'ijn': page}
    request_params.update(params)  # add the module-level image-search filters
    response = requests.get(
        search_url, params=request_params, headers=headers, timeout=timeout)
    response.raise_for_status()  # turn HTTP errors into exceptions
    return extract_urls(response.text)
def extract_urls(html):
    """Scrape image URLs out of a Google image-search results page.

    Each result sits in a ``<div class="rg_di">`` element whose content
    carries an ``imgurl=...&`` query fragment; the fragment value is
    returned as-is (still percent-encoded).
    """
    div_pattern = re.compile(r'<div class="rg_di"(.*?)</div>')
    url_pattern = re.compile(r'imgurl=(.*?)&')
    matches = (url_pattern.search(div) for div in div_pattern.findall(html))
    return [m.group(1) for m in matches if m]
def parse_args():
    """Parse command-line arguments with docopt, using the module
    docstring as the usage specification."""
    from docopt import docopt
    arguments = docopt(__doc__, version='1.0')
    return arguments
def main():
args = parse_args()
max_results = int(args['-n'])
terms = args['<search-term>']
query = ' '.join(terms)
urls = search(query, max_results, async=False)
print '\n'.join(urls)
if __name__ == '__main__':
main()
------解决方案--------------------
我也是凭代码猜测,不如断点一下或者打印出来吧
本人python小白,没学过python。。。
这里有一段python的源码
想求问这里实现搜索的那个最终url是什么
大概是类似这种:
http://www.google.com/search?q=XXX&..........
生成 url 的代码已在下面用粗体字标出,大概就在那里。请问最终请求的 url 是什么?感谢!
import requests
import re
import math
IMAGES_PER_PAGE = 100  # Google image search serves up to 100 thumbnails per result page (the 'ijn' page index assumes this)
search_url = 'http://www.google.com/search'  # base endpoint; the params below switch it into image-search mode
# Pretend to be Firefox v25.0
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:25.0) Gecko/20100101 Firefox/25.0'
}
# Image search for medium sized photos
params = {
'tbm': 'isch',  # tbm=isch selects Google *image* search
'tbs': 'isz:m,itp:photo'  # filter: medium size (isz:m), photo type (itp:photo)
}
def search_sync(query, npages, timeout=1.0):
    """Fetch result pages one at a time and collect their image URLs.

    A page that fails with a network error is skipped (best effort)
    rather than aborting the whole search.

    :param query: search terms, sent as the ``q`` parameter
    :param npages: number of result pages to request (ijn = 0..npages-1)
    :param timeout: per-request timeout in seconds
    :returns: list of image URL strings, in page order
    """
    results = []
    # range() instead of Py2-only xrange(): behaves identically here
    # (small page count) and is Python 3 compatible.
    for page in range(npages):
        try:
            results.extend(get_page_urls(query, page, timeout))
        except requests.exceptions.RequestException:
            # Best-effort: one failed page should not kill the search.
            continue
    return results
def search_async(query, npages, timeout=1.0):
    """Fetch all result pages concurrently using gevent greenlets.

    :param query: search terms, sent as the ``q`` parameter
    :param npages: number of result pages to request (ijn = 0..npages-1)
    :param timeout: per-request timeout in seconds
    :returns: list of image URL strings from every page that completed
    """
    import gevent  # local import: the sync path has no gevent dependency

    def fetch_page(page):
        # Swallow per-page network errors, mirroring search_sync.
        try:
            return get_page_urls(query, page, timeout)
        except requests.exceptions.RequestException:
            return []

    # range() instead of Py2-only xrange(): identical behavior, Py3-compatible.
    jobs = [gevent.spawn(fetch_page, page) for page in range(npages)]
    # NOTE(review): joinall reuses the per-request timeout as the TOTAL wait
    # for all pages, so still-running jobs (value is None) are silently
    # dropped below — confirm whether a larger join budget is intended.
    gevent.joinall(jobs, timeout=timeout)
    results = []
    for job in jobs:
        if job.value:
            results.extend(job.value)
    return results
def search(query, max_results=100, timeout=1.0, async=True):
npages = int(math.ceil(max_results / float(IMAGES_PER_PAGE)))
search_func = search_async if async else search_sync
return search_func(query, npages, timeout)[:max_results]
def get_page_urls(query, page, timeout=None):
    """Fetch one Google image-search result page and extract its image URLs.

    :param query: search terms (``q`` parameter)
    :param page: zero-based result-page index (``ijn`` parameter)
    :param timeout: requests timeout in seconds, or None for no timeout
    :raises requests.exceptions.RequestException: on network/HTTP failure
    """
    request_params = {'q': query, 'ijn': page}
    request_params.update(params)  # add the module-level image-search filters
    response = requests.get(
        search_url, params=request_params, headers=headers, timeout=timeout)
    response.raise_for_status()  # turn HTTP errors into exceptions
    return extract_urls(response.text)
def extract_urls(html):
    """Scrape image URLs out of a Google image-search results page.

    Each result sits in a ``<div class="rg_di">`` element whose content
    carries an ``imgurl=...&`` query fragment; the fragment value is
    returned as-is (still percent-encoded).
    """
    div_pattern = re.compile(r'<div class="rg_di"(.*?)</div>')
    url_pattern = re.compile(r'imgurl=(.*?)&')
    matches = (url_pattern.search(div) for div in div_pattern.findall(html))
    return [m.group(1) for m in matches if m]
def parse_args():
    """Parse command-line arguments with docopt, using the module
    docstring as the usage specification."""
    from docopt import docopt
    arguments = docopt(__doc__, version='1.0')
    return arguments
def main():
args = parse_args()
max_results = int(args['-n'])
terms = args['<search-term>']
query = ' '.join(terms)
urls = search(query, max_results, async=False)
print '\n'.join(urls)
if __name__ == '__main__':
main()
------解决方案--------------------
我也是凭代码猜测,不如断点一下或者打印出来吧