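# NOTE: web-page residue captured with this file, not part of the program:
# "Code pull complete; the page will refresh automatically."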
import re,time
from settings import *
import run_log
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
import json
def share_link_analysis(share_link):
    """Load a share page in headless Chrome and extract a summary of the post.

    The page is rendered with Selenium, then parsed with lxml. Three sources
    are merged into one dict: the ``application/ld+json`` script in the page,
    the JSON fragment found between ``censorTip":"","user":`` and ``,"poi":``
    in the first ``<script>`` directly under ``<body>``, and the page title
    and main paragraph text.

    Args:
        share_link: URL of the share page to open (``main`` passes a
            xiaohongshu.com discovery item link).

    Returns:
        dict: the subset of the merged data whose keys are among ``@type``,
        ``nickname``, ``red_id``, ``headline``, ``keywords``, ``content``,
        ``datePublished`` and ``uploadDate``. Keys absent from the page data
        are simply not present in the result.

    Raises:
        IndexError: if an expected element or the regex fragment is not found
            in the page source.
        ValueError: if the embedded data is not valid JSON
            (``json.JSONDecodeError`` is a subclass).
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # NOTE(review): `chrome_options=` and `executable_path=` are deprecated in
    # Selenium 4 — confirm the installed Selenium version before upgrading.
    # `webdriver_path` and `WEBDRIVER_IMPLICITLY_WAIT` are presumably provided
    # by `from settings import *` — verify against settings.py.
    driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=webdriver_path)
    run_log.logger.info("启动chrome浏览器")
    try:
        driver.implicitly_wait(WEBDRIVER_IMPLICITLY_WAIT)
        run_log.logger.debug("设置全局显示等待{}秒钟".format(WEBDRIVER_IMPLICITLY_WAIT))
        driver.get(share_link)
        # Short fixed pause before reading the source, kept from the original.
        time.sleep(1)
        page_source = driver.page_source
        selector = etree.HTML(page_source)

        # Structured metadata embedded as JSON-LD.
        head_info = selector.xpath('//script[@type = "application/ld+json"]/text()')[0]
        head_info = json.loads(head_info)
        run_log.logger.debug(head_info)

        # User data is cut out of the first inline script under <body>.
        body_info = selector.xpath('/html/body/script/text()')[0]
        body_info = re.findall(r'censorTip":"","user":(.*),"poi":', body_info)[0]
        body_info = json.loads(body_info)
        # Log the freshly parsed body data (previously head_info was logged twice).
        run_log.logger.debug(body_info)

        keywords = selector.xpath('//title/text()')[0]
        content = " ".join(selector.xpath('//*[@id="app"]//main/div/p/text()'))
        run_log.logger.debug(content)

        # Merge all fields; later sources override earlier ones on key clashes.
        info = {}
        info.update(head_info)
        info.update(body_info)
        info["keywords"] = keywords
        info["content"] = content

        info_key = ['@type', 'nickname', 'red_id', 'headline', 'keywords', 'content', 'datePublished', 'uploadDate']
        info_lite = {k: v for k, v in info.items() if k in info_key}
        run_log.logger.info(info_lite)
        time.sleep(1)
    finally:
        # Always shut the browser down, even when loading or parsing fails,
        # so no headless Chrome process is left running.
        driver.quit()
        run_log.logger.info("退出chrome浏览器")
    return info_lite
def main():
    """Run the analysis on a fixed sample share link and print the result."""
    sample_link = "https://www.xiaohongshu.com/discovery/item/60ddba2500000000010252dd"
    print(share_link_analysis(sample_link))
# Only run the demo when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
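# NOTE: hosting-site moderation notice captured with this file, not part of the program:
# "This section may contain content unsuitable for display, so the page does not show it.
# You can review and revise it using the relevant editing features."
# "If you confirm the content involves no inappropriate language, pure advertising or
# traffic diversion, violence, vulgar or pornographic material, infringement, piracy,
# false information, worthless content, or content violating national laws and
# regulations, you may click Submit to appeal, and we will handle it as soon as possible."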