加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
利用关键词爬取京东平台商品信息 2.79 KB
一键复制 编辑 原始数据 按行查看 历史
import requests
from bs4 import BeautifulSoup
import csv
import time
def _first_text(node, selector):
    """Return the stripped text of the first match of *selector* under *node*, or None."""
    found = node.select_one(selector)
    if found is None:
        return None
    return found.text.strip()


def _comment_count_value(product):
    """Sort key: the product's comment count as an int, 0 when unknown.

    Plain integers are used as-is.  A trailing "+" is ignored and a trailing
    "万" (10,000) multiplies the number, so a value such as "2万+" sorts as
    20000 instead of raising.  Anything that still cannot be parsed counts
    as 0, exactly like "N/A".
    """
    raw = product["comment_count"]
    if raw == "N/A":
        return 0
    text = raw.rstrip("+")
    multiplier = 1
    if text.endswith("万"):
        text = text[:-1]
        multiplier = 10000
    try:
        return int(text) * multiplier
    except ValueError:
        pass
    try:
        # round() rather than int() so that e.g. 2.3 * 10000 does not
        # truncate to 22999 because of binary floating point.
        return round(float(text) * multiplier)
    except (ValueError, OverflowError):
        return 0


def get_jd_products(keyword, max_page=1):
    """Search JD for *keyword* and return the 50 most-commented products.

    For every result page from 1 to *max_page* the listing is parsed, and
    each product's detail page is fetched to read its product id, comment
    count and store name.  Detail fields that cannot be found (or whose
    detail page cannot be fetched) are recorded as "N/A".

    Args:
        keyword: Search term; it is URL-encoded by ``requests``.
        max_page: Number of result pages to scan.  Values below 1 scan
            nothing and yield an empty list.

    Returns:
        A list of at most 50 dicts with the keys "name", "price",
        "product_id", "link", "comment_count" and "store_name", ordered by
        comment count, highest first.

    Raises:
        requests.RequestException: If a search result page cannot be fetched.
    """
    products = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'}
    # Without a timeout a stalled connection would block the scraper forever.
    timeout = 10

    for page in range(1, max_page + 1):
        # Passing the query via ``params`` lets requests percent-encode the
        # keyword, so characters such as "&" or "#" cannot corrupt the URL.
        response = requests.get(
            "https://search.jd.com/Search",
            params={"keyword": keyword, "enc": "utf-8", "page": page},
            headers=headers,
            timeout=timeout,
        )
        soup = BeautifulSoup(response.text, "html.parser")

        for item in soup.select("li.gl-item"):
            name = _first_text(item, "div.p-name a em")
            price = _first_text(item, "div.p-price i")
            anchor = item.select_one("div.p-name a")
            href = anchor.get("href", "").strip() if anchor is not None else ""
            # An entry without a name, price or link cannot be recorded;
            # skip it instead of aborting the whole run.
            if name is None or price is None or not href:
                continue
            # Only prepend the scheme when the href does not already carry one.
            if href.startswith(("http://", "https://")):
                link = href
            else:
                link = 'https:' + href

            try:
                details = requests.get(link, headers=headers, timeout=timeout)
            except requests.RequestException:
                # Keep the listing data; every detail lookup below then
                # falls back to "N/A" on the empty document.
                details_soup = BeautifulSoup("", "html.parser")
            else:
                details_soup = BeautifulSoup(details.text, "html.parser")

            # Product id: the text after the first ":" of the first parameter entry.
            try:
                product_id = details_soup.find(
                    "ul", {"class": "parameter2 p-parameter-list"}
                ).find("li").text.split(":")[1].strip()
            except (AttributeError, IndexError):
                product_id = "N/A"

            # Retrieve the comment count for the product
            try:
                comment_count = details_soup.select_one(".comment-count").text.strip().split()[1]
            except (AttributeError, IndexError):
                comment_count = "N/A"

            # Retrieve the store name for the product
            try:
                store_name = details_soup.select_one(".name a").text.strip()
            except AttributeError:
                store_name = "N/A"

            products.append({
                "name": name,
                "price": price,
                "product_id": product_id,
                "link": link,
                "comment_count": comment_count,
                "store_name": store_name,
            })

        # NOTE(review): the reviewed copy had lost its indentation, so it is
        # unclear whether this pause ran per product or per result page;
        # per page is assumed here -- confirm against the upstream file.
        time.sleep(5)

    # Sort the products by comment count in descending order and return the top 50
    sorted_products = sorted(products, key=_comment_count_value, reverse=True)
    return sorted_products[:50]
def save_products_to_csv(products, filename):
    """Write *products* to *filename* as a UTF-8 encoded CSV file.

    The first row is a fixed header; each following row holds one product's
    name, price, product id, link, comment count and store name, in that
    order.  A confirmation message is printed once the file is written.

    Args:
        products: Iterable of dicts providing the keys "name", "price",
            "product_id", "link", "comment_count" and "store_name".
        filename: Path of the CSV file to create or overwrite.
    """
    header = ["名称", "价格", "货号", "链接", "评论数", "店铺名称"]
    # Dict keys in the same order as the header columns above.
    columns = ("name", "price", "product_id", "link", "comment_count", "store_name")

    with open(filename, 'w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        sheet.writerows([entry[column] for column in columns] for entry in products)

    print(f"商品信息已保存至{filename}!")
def main():
    """Scrape 10 JD result pages for a fixed keyword and save the result as CSV.

    The output file name is built from the keyword and the page count.
    """
    search_term = "清风原木"
    page_limit = 10
    top_products = get_jd_products(search_term, max_page=page_limit)
    save_products_to_csv(top_products, f"{search_term}_{page_limit}页_评论数前50.csv")


if __name__ == "__main__":
    main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化