# 京东 (JD.com) image crawler
import os
import re
import urllib.error
import urllib.request


def crawl(url, page):
    """Download the images found on one listing page.

    The page at *url* is fetched, a container region is cut out of it with a
    first regex, and image addresses are extracted from that region with a
    second regex. Each image is saved in the current working directory as
    ``<page><n>.jpg``, where ``n`` is a running counter starting at 1.

    Args:
        url: Address of the listing page to fetch.
        page: Page number; used only as the filename prefix.

    Raises:
        urllib.error.URLError: If the listing page itself cannot be fetched.
            Failed downloads of individual images are skipped instead.
    """
    # Close the response once the body is read instead of leaking the socket.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    # str() of bytes yields the "b'...'" repr: line breaks become the two
    # characters "\n", so a "." in the patterns can run across source lines.
    html = str(raw)

    # NOTE(review): both patterns are empty in this file - presumably the
    # original HTML-matching regexes were lost. An empty pattern only ever
    # yields empty matches, so restore them before relying on the output.
    pattern = ''
    pattern2 = ''

    regions = re.compile(pattern).findall(html)
    if not regions:
        # Nothing on the page matched the container pattern; nothing to do.
        return
    imglist = re.compile(pattern2).findall(regions[0])

    x = 1
    for imgurl in imglist:
        imgname = os.path.join(os.getcwd(), str(page) + str(x) + ".jpg")
        try:
            # The extracted addresses carry no scheme, so one is prepended.
            urllib.request.urlretrieve("http://" + imgurl, filename=imgname)
        except urllib.error.URLError as e:
            # Best-effort: skip this image. The counter additionally advances
            # once for each of "code"/"reason" the error carries, so a failed
            # download leaves a gap in the numbering. This is preserved on
            # purpose so that the generated filenames do not change.
            if hasattr(e, "code"):
                x += 1
            if hasattr(e, "reason"):
                x += 1
        x += 1


# Crawl listing pages 1-5 of the category.
for i in range(1, 6):
    url = "https://list.jd.com/list.html?cat=9987,653,655&page=" + str(i)
    crawl(url, i)
# 千图网 (58pic.com) image downloader
import re
import urllib.request

# Absolute http:// URL ending in ".jpg". Whitespace, quotes, angle brackets
# and parentheses terminate a URL, so two links that sit next to each other
# in the markup are not fused into a single bogus match.
_JPG_URL_RE = re.compile(r'http://[^\s"\'<>()]*\.jpg')


def download_page(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to request.

    Returns:
        The response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails; errors raised by
            ``urlopen`` are not caught here.
    """
    request = urllib.request.Request(url)
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        return response.read()


def get_image(html):
    """Download every ``http://...jpg`` image referenced in *html*.

    Images are written relative to the current working directory as
    ``1.jpg``, ``2.jpg``, ... in the order their URLs appear in the page.
    A progress line is printed after each file is written.

    Args:
        html: Page content, either the ``bytes`` returned by
            ``download_page`` or an already decoded ``str``.

    Raises:
        urllib.error.URLError: If any image download fails; the error is
            not caught, so the remaining images are not fetched.
    """
    if isinstance(html, (bytes, bytearray)):
        # latin-1 maps every byte to exactly one character, so decoding can
        # never fail and real line breaks stay whitespace for the regex
        # (unlike repr(), which turns them into the text "\n").
        text = html.decode("latin-1")
    else:
        text = str(html)

    for num, img_url in enumerate(_JPG_URL_RE.findall(text), start=1):
        data = download_page(img_url)  # fetch the image bytes themselves
        with open('%s.jpg' % num, 'wb') as fp:
            fp.write(data)
        print('正在下载第%s张图片' % num)


url = 'http://www.58pic.com'
html = download_page(url)
get_image(html)