加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
web_element.py 1.87 KB
一键复制 编辑 原始数据 按行查看 历史
夏令 提交于 2022-02-18 03:24 . 小红书阅读辅助工具
import re,time
from settings import *
import run_log
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
import json
def share_link_analysis(share_link):
    """Load a share link in headless Chrome and extract a summary of the page.

    The page is rendered with Selenium, its source is parsed with lxml, and
    three pieces of data are merged into one dict:

    * the JSON object in the first ``<script type="application/ld+json">``;
    * the JSON fragment found between ``censorTip":"","user":`` and
      ``,"poi":`` in the first ``/html/body/script`` text;
    * the ``<title>`` text (stored as ``keywords``) and the paragraph text
      under ``#app//main/div/p`` joined with spaces (stored as ``content``).

    Args:
        share_link: URL of the page to open.

    Returns:
        A dict restricted to the keys ``@type``, ``nickname``, ``red_id``,
        ``headline``, ``keywords``, ``content``, ``datePublished`` and
        ``uploadDate`` (only those actually present).

    Raises:
        IndexError: if an expected element or the regex fragment is missing
            from the page.
        json.JSONDecodeError: if an extracted fragment is not valid JSON.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=webdriver_path)
    run_log.logger.info("启动chrome浏览器")
    # Everything after a successful launch runs under try/finally so the
    # browser process is always shut down, even when parsing fails.
    try:
        driver.implicitly_wait(WEBDRIVER_IMPLICITLY_WAIT)
        run_log.logger.debug("设置全局显示等待{}秒钟".format(WEBDRIVER_IMPLICITLY_WAIT))
        driver.get(share_link)
        # Short fixed pause before reading the rendered page source.
        time.sleep(1)
        page_source = driver.page_source
        selector = etree.HTML(page_source)

        # Structured metadata embedded as JSON-LD.
        head_info = selector.xpath('//script[@type = "application/ld+json"]/text()')[0]
        head_info = json.loads(head_info)
        run_log.logger.debug(head_info)

        # Author data is cut out of the first inline body script with a regex.
        body_info = selector.xpath('/html/body/script/text()')[0]
        body_info = re.findall(r'censorTip":"","user":(.*),"poi":', body_info)[0]
        body_info = json.loads(body_info)
        # The original logged head_info here a second time; log what was
        # actually just parsed.
        run_log.logger.debug(body_info)

        keywords = selector.xpath('//title/text()')[0]
        content = " ".join(selector.xpath('//*[@id="app"]//main/div/p/text()'))
        run_log.logger.debug(content)

        # Merge all fields; later updates override earlier keys on collision.
        info = {}
        info.update(head_info)
        info.update(body_info)
        info["keywords"] = keywords
        info["content"] = content

        # Keep only the fields of interest.
        info_key = {'@type', 'nickname', 'red_id', 'headline', 'keywords', 'content', 'datePublished', 'uploadDate'}
        info_lite = {k: v for k, v in info.items() if k in info_key}
        run_log.logger.info(info_lite)
        time.sleep(1)
        return info_lite
    finally:
        driver.quit()
        run_log.logger.info("退出chrome浏览器")
def main():
    """Analyse a fixed sample share link and print the extracted fields."""
    sample_link = "https://www.xiaohongshu.com/discovery/item/60ddba2500000000010252dd"
    result = share_link_analysis(sample_link)
    print(result)
# Run the sample analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化