import requests
from lxml import etree
from queue import Queue
from threading import Thread
import json

class T(object):
    def __init__(self):
        self.base_url = 'https://adidas.tmall.com/i/asynSearch.htm?_ksTS=1521885321416_124&callback=jsonp125&mid=w-14687612648-0&wid=14687612648&path=/search.htm&spm=a1z10.5-b-s.w4010-14694769136.2.6ba81ca2c4mcBi&search=y&pageNo={}'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'upgrade-insecure-requests': '1',
            # NOTE: this cookie is session-bound and will eventually expire,
            # at which point requests start failing or getting redirected.
            'cookie': 'hng=CN%7Czh-CN%7CCNY%7C156; cna=5xo2E2pQWysCAdOKFKrSMlAi; uss=; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; '
                      't=7d20fa48525a43004df64f8e6334b995; uc3=nk2=o%2FNRjrQWSd7Hug%3D%3D&id2=UNJQ7f5DyLvB&vt3=F8dBz4KArxGmiprHtTg%3D&lg2=V32FPkk%2Fw0dUvg%3D%3D; '
                      'tracknick=%5Cu674E%5Cu6D77%5Cu632F7515; lgc=%5Cu674E%5Cu6D77%5Cu632F7515; _tb_token_=e53e16f359135; cookie2=1657e9232249981d7c30f645eb1b9efc; _'
                      'uab_collina=152188184681784369577723; _umdata=486B7B12C6AA95F2DA571022EEA8571FA298E12220DD73F8358813D5DD5AD4A7E49EC39E3CA953D0CD43AD3E795C914C6E426C75F66C22C316F4864FB771E439; '
                      'cq=ccp%3D1; pnm_cku822=; isg=BAwM09Uouulx-a6QgT0HRRio3Wr-7dP_jsZljmbNErda8az7jlWAfwJDlflJouhH; swfstore=63843',
            'referer': 'https://adidas.tmall.com/category-1361476884.htm?spm=a1z10.1-b-s.w11827624-14707335302.12.64c65597nXCXZD',
            # 'authority'/'method'/'path'/'scheme' were copied from the browser's
            # HTTP/2 pseudo-headers; requests sends them as ordinary headers.
            'authority': 'adidas.tmall.com',
            'method': 'GET',
            'path': '/category.htm?spm=a1z10.5-b-s.w4010-14694769136.2.6ba81ca2c4mcBi&search=y',
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
        }
        # Queue of listing-page URLs waiting to be parsed
        self.page_url_queue = Queue()
        # Queue of scraped product records waiting to be written out
        self.product_info_queue = Queue()
        print('Spider started')
    # Send a request and return the response
    def get_response(self, url):
        response = requests.get(url=url, headers=self.headers)
        return response
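    # NOTE: requests.get() above is called without a timeout, so one stalled
    # connection can hang a worker thread indefinitely; passing e.g.
    # timeout=10 (an assumed value, not part of the original script) is a
    # common hardening step.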
    # Build the URL for each listing page and enqueue it
    def get_page_url(self):
        # range(1, 2) crawls only page 1; widen the range to cover more pages
        for i in range(1, 2):
            page_url = self.base_url.format(i)
            if self.get_response(page_url).status_code == 200:
                self.page_url_queue.put(page_url)
                print('Fetched listing page {}, queued for parsing...'.format(i))
    # Extract product data from each listing page
    def parse_data(self):
        # Queue.not_empty is a Condition object and always truthy, so looping
        # on it is equivalent to `while True`; the real exit condition is the
        # get() timeout below.
        while True:
            try:
                page_url = self.page_url_queue.get(timeout=3)
            except Exception as e:
                print(e)
                break
            result = self.get_response(page_url)
            response = result.content.decode('gbk')  # Tmall shop pages are GBK-encoded
            # The asynSearch endpoint returns JSONP whose embedded HTML escapes
            # its quotes; unescape them so the XPath class match can work.
            response = response.replace('\\"', '"')
            html = etree.HTML(response)
            node_list = html.xpath('//dd[@class="detail"]')
            print(len(node_list))
            for node in node_list:
                product = {}
                product['info'] = node.xpath('./a/text()')[0].strip()
                product['price'] = node.xpath('./dd/div/div[1]/span[2]/text()')[0].strip()
                print('Parsed a product, queueing it for the writer...')
                self.product_info_queue.put(product)
    # Write product records to a file
    def write(self):
        while True:
            try:
                product = self.product_info_queue.get(timeout=3)
            except Exception as e:
                print(e)
                break
            j_product = json.dumps(product, ensure_ascii=False)
            # ensure_ascii=False emits raw Chinese text, so pin the file
            # encoding instead of relying on the platform default. Note the
            # result is one comma-terminated JSON object per line, not a
            # single valid JSON document.
            with open('response.json', 'a', encoding='utf-8') as f:
                f.write(j_product + ',\n')
                print('Writing record...')
    def run(self):
        thread_list = []
        t1 = Thread(target=self.get_page_url)
        thread_list.append(t1)
        for _ in range(3):
            thread_list.append(Thread(target=self.parse_data))
        t3 = Thread(target=self.write)
        thread_list.append(t3)
        # Start every thread before joining any; joining inside the start
        # loop would run the workers one at a time instead of concurrently.
        for t in thread_list:
            t.start()
        for t in thread_list:
            t.join()

if __name__ == '__main__':
    t = T()
    t.run()
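
A note on the shutdown strategy: both consumer loops above exit as soon as get() has seen nothing for 3 seconds, so a single slow fetch can make the parsers or the writer quit while work is still in flight. A more deterministic pattern pairs Queue.task_done() with Queue.join() and explicit sentinel values. The sketch below is a minimal standalone illustration of that pattern, not part of the original script; worker, NUM_WORKERS, SENTINEL, and the doubling "work" are all made-up names for illustration.

from queue import Queue
from threading import Thread

NUM_WORKERS = 3
SENTINEL = None  # special value telling a worker to exit


def worker(q, results):
    # Keep pulling items until the sentinel arrives.
    while True:
        item = q.get()
        if item is SENTINEL:
            q.task_done()
            break
        results.append(item * 2)  # stand-in for real parsing work
        q.task_done()


q = Queue()
results = []
threads = [Thread(target=worker, args=(q, results)) for _ in range(NUM_WORKERS)]
for t in threads:
    t.start()

for i in range(10):           # producer: enqueue the real work first
    q.put(i)
for _ in range(NUM_WORKERS):  # then exactly one sentinel per worker
    q.put(SENTINEL)

q.join()   # returns only after task_done() has been called for every item
for t in threads:
    t.join()
print(sorted(results))

Mapped onto the scraper, get_page_url would enqueue one sentinel per parser thread after the last listing page, and each parser would do the same for the writer, so threads exit exactly when the work runs out rather than after an arbitrary timeout.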