# Repository: open-source-sharing (branch: master, 1 branch)
# Title: 利用关键词爬取京东平台商品信息
# (Scrape product information from the JD.com platform by keyword.)

import requests
from bs4 import BeautifulSoup
import csv
import time


# Placeholder stored for any detail-page field that could not be extracted.
_NOT_AVAILABLE = "N/A"

# Seconds to wait for any single HTTP request before giving up.
_REQUEST_TIMEOUT = 10


def _comment_count_sort_key(comment_count):
    """Turn a scraped comment-count string into an integer for sorting.

    Plain integers ("1234") are used as-is. Abbreviated forms with a
    trailing "+" and/or a "万" (10,000) suffix, e.g. "2000+" or "1.5万+",
    are expanded. Anything that cannot be parsed, including the "N/A"
    placeholder, sorts as 0.
    """
    text = str(comment_count).strip().rstrip("+")
    multiplier = 1
    if text.endswith("万"):
        multiplier = 10000
        text = text[:-1]
    try:
        return int(text) * multiplier
    except ValueError:
        pass
    try:
        # Handles fractional abbreviations such as "1.5" (from "1.5万+").
        return int(float(text) * multiplier)
    except (ValueError, OverflowError):
        return 0


def _scrape_product_details(link, headers):
    """Fetch a product page and return (product_id, comment_count, store_name).

    Each field falls back to "N/A" when the page cannot be fetched or the
    expected element/text is missing, so one bad product page never aborts
    the whole crawl.
    """
    try:
        response = requests.get(link, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        return _NOT_AVAILABLE, _NOT_AVAILABLE, _NOT_AVAILABLE

    details_soup = BeautifulSoup(response.text, "html.parser")

    # The first <li> of the parameter list is split on a full-width colon;
    # the part after the colon is stored as the product id.
    try:
        parameter_list = details_soup.find("ul", {"class": "parameter2 p-parameter-list"})
        product_id = parameter_list.find("li").text.split("：")[1].strip()
    except (AttributeError, IndexError):
        product_id = _NOT_AVAILABLE

    # The second whitespace-separated token of the ".comment-count" element
    # is taken as the comment count.
    try:
        comment_count = details_soup.select_one(".comment-count").text.strip().split()[1]
    except (AttributeError, IndexError):
        comment_count = _NOT_AVAILABLE

    try:
        store_name = details_soup.select_one(".name a").text.strip()
    except AttributeError:
        store_name = _NOT_AVAILABLE

    return product_id, comment_count, store_name


def get_jd_products(keyword, max_page=1):
    """Search JD.com for *keyword* and return the most-commented products.

    For every result page from 1 to *max_page*, each listed product's name,
    price and link are read from the search page, then the product's own
    page is fetched for its id, comment count and store name.

    Args:
        keyword: Search term; it is URL-encoded by ``requests``.
        max_page: Number of result pages to crawl. Values below 1 crawl
            nothing and yield an empty list.

    Returns:
        A list of at most 50 dicts with the keys ``name``, ``price``,
        ``product_id``, ``link``, ``comment_count`` and ``store_name``
        (all strings), sorted by comment count in descending order.
        Unparseable or missing comment counts sort as 0.

    Raises:
        requests.RequestException: If a search result page cannot be
            fetched. Failures on individual product pages are tolerated
            and recorded as "N/A" fields instead.
    """
    products = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'}

    for page in range(1, max_page + 1):
        # Passing the query via ``params`` lets requests percent-encode the
        # keyword, so characters like "&", "#" or spaces cannot break the URL.
        response = requests.get(
            "https://search.jd.com/Search",
            params={"keyword": keyword, "enc": "utf-8", "page": page},
            headers=headers,
            timeout=_REQUEST_TIMEOUT,
        )
        soup = BeautifulSoup(response.text, "html.parser")

        for item in soup.select("li.gl-item"):
            name_tag = item.select_one("div.p-name a em")
            price_tag = item.select_one("div.p-price i")
            link_tag = item.select_one("div.p-name a")
            # Skip entries that lack any of the required elements instead of
            # crashing on them.
            if name_tag is None or price_tag is None or link_tag is None:
                continue
            href = (link_tag.get("href") or "").strip()
            if not href:
                continue

            # Only prefix the scheme when the href does not already carry one.
            if href.startswith(("http://", "https://")):
                link = href
            else:
                link = 'https:' + href

            product_id, comment_count, store_name = _scrape_product_details(link, headers)
            products.append({
                "name": name_tag.text.strip(),
                "price": price_tag.text.strip(),
                "product_id": product_id,
                "link": link,
                "comment_count": comment_count,
                "store_name": store_name,
            })

        # Pause between result pages; skipped after the last page because no
        # further search request follows.
        if page < max_page:
            time.sleep(5)

    # Sort the products by comment count in descending order and return the top 50
    sorted_products = sorted(
        products,
        key=lambda p: _comment_count_sort_key(p["comment_count"]),
        reverse=True,
    )
    return sorted_products[:50]


def save_products_to_csv(products, filename):
    """Write *products* to *filename* as a UTF-8 CSV file and print a notice.

    Args:
        products: Iterable of dicts, each providing the keys ``name``,
            ``price``, ``product_id``, ``link``, ``comment_count`` and
            ``store_name``.
        filename: Path of the CSV file to create; an existing file is
            overwritten.

    Raises:
        KeyError: If a product dict lacks one of the expected keys.
    """
    header = ["名称", "价格", "货号", "链接", "评论数", "店铺名称"]
    # Dict keys in the same order as the header columns above.
    columns = ("name", "price", "product_id", "link", "comment_count", "store_name")

    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        sheet.writerows([entry[column] for column in columns] for entry in products)

    print(f"商品信息已保存至{filename}！")


if __name__ == "__main__":
    # Script entry point: crawl the configured number of result pages for a
    # fixed search term and store the returned products in a CSV file whose
    # name is derived from the term and the page count.
    search_term, page_limit = "清风原木", 10
    top_products = get_jd_products(search_term, max_page=page_limit)
    save_products_to_csv(
        top_products,
        f"{search_term}_{page_limit}页_评论数前50.csv",
    )