使用python写的一个过google 图片搜索爬虫解决思路
使用python写的一个过google 图片搜索爬虫
本人python小白,没学过python。。。
这里有一段python的源码
想求问这里实现搜索的那个最终url是什么
大概是类似这种:
http://www.google.com/search?q=XXX&..........
生成 url 的代码已在下面用粗体字标出,大概就在那里。请问最终请求的 url 是什么?感谢!
import requests
import re
import math
IMAGES_PER_PAGE = 100  # Google image search serves up to 100 thumbnails per result page (the 'ijn' page index assumes this)
search_url = 'http://www.google.com/search'  # base endpoint; the params below switch it into image-search mode
# Pretend to be Firefox v25.0
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:25.0) Gecko/20100101 Firefox/25.0'
}
# Image search for medium sized photos
params = {
'tbm': 'isch',  # tbm=isch selects Google *image* search
'tbs': 'isz:m,itp:photo'  # filter: medium size (isz:m), photo type (itp:photo)
}
def search_sync(query, npages, timeout=1.0):
    """Fetch result pages one at a time and collect their image URLs.

    A page that fails with a network error is skipped (best effort)
    rather than aborting the whole search.

    :param query: search terms, sent as the ``q`` parameter
    :param npages: number of result pages to request (ijn = 0..npages-1)
    :param timeout: per-request timeout in seconds
    :returns: list of image URL strings, in page order
    """
    results = []
    # range() instead of Py2-only xrange(): behaves identically here
    # (small page count) and is Python 3 compatible.
    for page in range(npages):
        try:
            results.extend(get_page_urls(query, page, timeout))
        except requests.exceptions.RequestException:
            # Best-effort: one failed page should not kill the search.
            continue
    return results
def search_async(query, npages, timeout=1.0):
    """Fetch all result pages concurrently using gevent greenlets.

    :param query: search terms, sent as the ``q`` parameter
    :param npages: number of result pages to request (ijn = 0..npages-1)
    :param timeout: per-request timeout in seconds
    :returns: list of image URL strings from every page that completed
    """
    import gevent  # local import: the sync path has no gevent dependency

    def fetch_page(page):
        # Swallow per-page network errors, mirroring search_sync.
        try:
            return get_page_urls(query, page, timeout)
        except requests.exceptions.RequestException:
            return []

    # range() instead of Py2-only xrange(): identical behavior, Py3-compatible.
    jobs = [gevent.spawn(fetch_page, page) for page in range(npages)]
    # NOTE(review): joinall reuses the per-request timeout as the TOTAL wait
    # for all pages, so still-running jobs (value is None) are silently
    # dropped below — confirm whether a larger join budget is intended.
    gevent.joinall(jobs, timeout=timeout)
    results = []
    for job in jobs:
        if job.value:
            results.extend(job.value)
    return results
def search(query, max_results=100, timeout=1.0, async=True):
npages = int(math.ceil(max_results / float(IMAGES_PER_PAGE)))
search_func = search_async if async else search_sync
return search_func(query, npages, timeout)[:max_results]
def get_page_urls(query, page, timeout=None):
    """Fetch one Google image-search result page and extract its image URLs.

    :param query: search terms (``q`` parameter)
    :param page: zero-based result-page index (``ijn`` parameter)
    :param timeout: requests timeout in seconds, or None for no timeout
    :raises requests.exceptions.RequestException: on network/HTTP failure
    """
    request_params = {'q': query, 'ijn': page}
    request_params.update(params)  # add the module-level image-search filters
    response = requests.get(
        search_url, params=request_params, headers=headers, timeout=timeout)
    response.raise_for_status()  # turn HTTP errors into exceptions
    return extract_urls(response.text)
def extract_urls(html):
    """Scrape image URLs out of a Google image-search results page.

    Each result sits in a ``<div class="rg_di">`` element whose content
    carries an ``imgurl=...&`` query fragment; the fragment value is
    returned as-is (still percent-encoded).
    """
    div_pattern = re.compile(r'<div class="rg_di"(.*?)</div>')
    url_pattern = re.compile(r'imgurl=(.*?)&')
    matches = (url_pattern.search(div) for div in div_pattern.findall(html))
    return [m.group(1) for m in matches if m]
def parse_args():
    """Parse command-line arguments with docopt, using the module
    docstring as the usage specification."""
    from docopt import docopt
    arguments = docopt(__doc__, version='1.0')
    return arguments
def main():
args = parse_args()
max_results = int(args['-n'])
terms = args['<search-term>']
query = ' '.join(terms)
urls = search(query, max_results, async=False)
print '\n'.join(urls)
if __name__ == '__main__':
main()
------解决方案--------------------
我也是凭代码猜测,不如断点一下或者打印出来吧
本人python小白,没学过python。。。
这里有一段python的源码
想求问这里实现搜索的那个最终url是什么
大概是类似这种:
http://www.google.com/search?q=XXX&..........
生成 url 的代码已在下面用粗体字标出,大概就在那里。请问最终请求的 url 是什么?感谢!
import requests
import re
import math
IMAGES_PER_PAGE = 100  # Google image search serves up to 100 thumbnails per result page (the 'ijn' page index assumes this)
search_url = 'http://www.google.com/search'  # base endpoint; the params below switch it into image-search mode
# Pretend to be Firefox v25.0
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:25.0) Gecko/20100101 Firefox/25.0'
}
# Image search for medium sized photos
params = {
'tbm': 'isch',  # tbm=isch selects Google *image* search
'tbs': 'isz:m,itp:photo'  # filter: medium size (isz:m), photo type (itp:photo)
}
def search_sync(query, npages, timeout=1.0):
    """Fetch result pages one at a time and collect their image URLs.

    A page that fails with a network error is skipped (best effort)
    rather than aborting the whole search.

    :param query: search terms, sent as the ``q`` parameter
    :param npages: number of result pages to request (ijn = 0..npages-1)
    :param timeout: per-request timeout in seconds
    :returns: list of image URL strings, in page order
    """
    results = []
    # range() instead of Py2-only xrange(): behaves identically here
    # (small page count) and is Python 3 compatible.
    for page in range(npages):
        try:
            results.extend(get_page_urls(query, page, timeout))
        except requests.exceptions.RequestException:
            # Best-effort: one failed page should not kill the search.
            continue
    return results
def search_async(query, npages, timeout=1.0):
    """Fetch all result pages concurrently using gevent greenlets.

    :param query: search terms, sent as the ``q`` parameter
    :param npages: number of result pages to request (ijn = 0..npages-1)
    :param timeout: per-request timeout in seconds
    :returns: list of image URL strings from every page that completed
    """
    import gevent  # local import: the sync path has no gevent dependency

    def fetch_page(page):
        # Swallow per-page network errors, mirroring search_sync.
        try:
            return get_page_urls(query, page, timeout)
        except requests.exceptions.RequestException:
            return []

    # range() instead of Py2-only xrange(): identical behavior, Py3-compatible.
    jobs = [gevent.spawn(fetch_page, page) for page in range(npages)]
    # NOTE(review): joinall reuses the per-request timeout as the TOTAL wait
    # for all pages, so still-running jobs (value is None) are silently
    # dropped below — confirm whether a larger join budget is intended.
    gevent.joinall(jobs, timeout=timeout)
    results = []
    for job in jobs:
        if job.value:
            results.extend(job.value)
    return results
def search(query, max_results=100, timeout=1.0, async=True):
npages = int(math.ceil(max_results / float(IMAGES_PER_PAGE)))
search_func = search_async if async else search_sync
return search_func(query, npages, timeout)[:max_results]
def get_page_urls(query, page, timeout=None):
    """Fetch one Google image-search result page and extract its image URLs.

    :param query: search terms (``q`` parameter)
    :param page: zero-based result-page index (``ijn`` parameter)
    :param timeout: requests timeout in seconds, or None for no timeout
    :raises requests.exceptions.RequestException: on network/HTTP failure
    """
    request_params = {'q': query, 'ijn': page}
    request_params.update(params)  # add the module-level image-search filters
    response = requests.get(
        search_url, params=request_params, headers=headers, timeout=timeout)
    response.raise_for_status()  # turn HTTP errors into exceptions
    return extract_urls(response.text)
def extract_urls(html):
    """Scrape image URLs out of a Google image-search results page.

    Each result sits in a ``<div class="rg_di">`` element whose content
    carries an ``imgurl=...&`` query fragment; the fragment value is
    returned as-is (still percent-encoded).
    """
    div_pattern = re.compile(r'<div class="rg_di"(.*?)</div>')
    url_pattern = re.compile(r'imgurl=(.*?)&')
    matches = (url_pattern.search(div) for div in div_pattern.findall(html))
    return [m.group(1) for m in matches if m]
def parse_args():
    """Parse command-line arguments with docopt, using the module
    docstring as the usage specification."""
    from docopt import docopt
    arguments = docopt(__doc__, version='1.0')
    return arguments
def main():
args = parse_args()
max_results = int(args['-n'])
terms = args['<search-term>']
query = ' '.join(terms)
urls = search(query, max_results, async=False)
print '\n'.join(urls)
if __name__ == '__main__':
main()
------解决方案--------------------
我也是凭代码猜测,不如断点一下或者打印出来吧