import requests
from lxml import etree
from queue import Queue
from threading import Thread
import json

class T(object):
    def __init__(self):
        self.base_url = 'https://adidas.tmall.com/i/asynSearch.htm?_ksTS=1521885321416_124&callback=jsonp125&mid=w-14687612648-0&wid=14687612648&path=/search.htm&spm=a1z10.5-b-s.w4010-14694769136.2.6ba81ca2c4mcBi&search=y&pageNo={}'
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'upgrade-insecure-requests': '1',
            # NOTE: this cookie is session-bound and will eventually expire,
            # at which point requests start failing or getting redirected.
            'cookie': 'hng=CN%7Czh-CN%7CCNY%7C156; cna=5xo2E2pQWysCAdOKFKrSMlAi; uss=; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; '
                      't=7d20fa48525a43004df64f8e6334b995; uc3=nk2=o%2FNRjrQWSd7Hug%3D%3D&id2=UNJQ7f5DyLvB&vt3=F8dBz4KArxGmiprHtTg%3D&lg2=V32FPkk%2Fw0dUvg%3D%3D; '
                      'tracknick=%5Cu674E%5Cu6D77%5Cu632F7515; lgc=%5Cu674E%5Cu6D77%5Cu632F7515; _tb_token_=e53e16f359135; cookie2=1657e9232249981d7c30f645eb1b9efc; _'
                      'uab_collina=152188184681784369577723; _umdata=486B7B12C6AA95F2DA571022EEA8571FA298E12220DD73F8358813D5DD5AD4A7E49EC39E3CA953D0CD43AD3E795C914C6E426C75F66C22C316F4864FB771E439; '
                      'cq=ccp%3D1; pnm_cku822=; isg=BAwM09Uouulx-a6QgT0HRRio3Wr-7dP_jsZljmbNErda8az7jlWAfwJDlflJouhH; swfstore=63843',
            'referer': 'https://adidas.tmall.com/category-1361476884.htm?spm=a1z10.1-b-s.w11827624-14707335302.12.64c65597nXCXZD',
            # 'authority'/'method'/'path'/'scheme' were copied from the browser's
            # HTTP/2 pseudo-headers; requests sends them as ordinary headers.
            'authority': 'adidas.tmall.com',
            'method': 'GET',
            'path': '/category.htm?spm=a1z10.5-b-s.w4010-14694769136.2.6ba81ca2c4mcBi&search=y',
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
        }
        # Queue of listing-page URLs waiting to be parsed
        self.page_url_queue = Queue()
        # Queue of scraped product records waiting to be written out
        self.product_info_queue = Queue()
        print('Spider started')
    # Send a request and return the response
    def get_response(self, url):
        response = requests.get(url=url, headers=self.headers)
        return response
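    # NOTE: requests.get() above is called without a timeout, so one stalled
    # connection can hang a worker thread indefinitely; passing e.g.
    # timeout=10 (an assumed value, not part of the original script) is a
    # common hardening step.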
    # Build the URL for each listing page and enqueue it
    def get_page_url(self):
        # range(1, 2) crawls only page 1; widen the range to cover more pages
        for i in range(1, 2):
            page_url = self.base_url.format(i)
            if self.get_response(page_url).status_code == 200:
                self.page_url_queue.put(page_url)
                print('Fetched listing page {}, queued for parsing...'.format(i))
    # Extract product data from each listing page
    def parse_data(self):
        # Queue.not_empty is a Condition object and always truthy, so looping
        # on it is equivalent to `while True`; the real exit condition is the
        # get() timeout below.
        while True:
            try:
                page_url = self.page_url_queue.get(timeout=3)
            except Exception as e:
                print(e)
                break
            result = self.get_response(page_url)
            response = result.content.decode('gbk')  # Tmall shop pages are GBK-encoded
            # The asynSearch endpoint returns JSONP whose embedded HTML escapes
            # its quotes; unescape them so the XPath class match can work.
            response = response.replace('\\"', '"')
            html = etree.HTML(response)
            node_list = html.xpath('//dd[@class="detail"]')
            print(len(node_list))
            for node in node_list:
                product = {}
                product['info'] = node.xpath('./a/text()')[0].strip()
                product['price'] = node.xpath('./dd/div/div[1]/span[2]/text()')[0].strip()
                print('Parsed a product, queueing it for the writer...')
                self.product_info_queue.put(product)
    # Write product records to a file
    def write(self):
        while True:
            try:
                product = self.product_info_queue.get(timeout=3)
            except Exception as e:
                print(e)
                break
            j_product = json.dumps(product, ensure_ascii=False)
            # ensure_ascii=False emits raw Chinese text, so pin the file
            # encoding instead of relying on the platform default. Note the
            # result is one comma-terminated JSON object per line, not a
            # single valid JSON document.
            with open('response.json', 'a', encoding='utf-8') as f:
                f.write(j_product + ',\n')
                print('Writing record...')
    def run(self):
        thread_list = []
        t1 = Thread(target=self.get_page_url)
        thread_list.append(t1)
        for _ in range(3):
            thread_list.append(Thread(target=self.parse_data))
        t3 = Thread(target=self.write)
        thread_list.append(t3)
        # Start every thread before joining any; joining inside the start
        # loop would run the workers one at a time instead of concurrently.
        for t in thread_list:
            t.start()
        for t in thread_list:
            t.join()

if __name__ == '__main__':
    t = T()
    t.run()
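
A note on the shutdown strategy: both consumer loops above exit as soon as get() has seen nothing for 3 seconds, so a single slow fetch can make the parsers or the writer quit while work is still in flight. A more deterministic pattern pairs Queue.task_done() with Queue.join() and explicit sentinel values. The sketch below is a minimal standalone illustration of that pattern, not part of the original script; worker, NUM_WORKERS, SENTINEL, and the doubling "work" are all made-up names for illustration.

from queue import Queue
from threading import Thread

NUM_WORKERS = 3
SENTINEL = None  # special value telling a worker to exit


def worker(q, results):
    # Keep pulling items until the sentinel arrives.
    while True:
        item = q.get()
        if item is SENTINEL:
            q.task_done()
            break
        results.append(item * 2)  # stand-in for real parsing work
        q.task_done()


q = Queue()
results = []
threads = [Thread(target=worker, args=(q, results)) for _ in range(NUM_WORKERS)]
for t in threads:
    t.start()

for i in range(10):           # producer: enqueue the real work first
    q.put(i)
for _ in range(NUM_WORKERS):  # then exactly one sentinel per worker
    q.put(SENTINEL)

q.join()   # returns only after task_done() has been called for every item
for t in threads:
    t.join()
print(sorted(results))

Mapped onto the scraper, get_page_url would enqueue one sentinel per parser thread after the last listing page, and each parser would do the same for the writer, so threads exit exactly when the work runs out rather than after an arbitrary timeout.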