master

分支 (1)

管理

管理

master

tofu
/
web_element.py

import re,time
from settings import *
import run_log
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
import json


def share_link_analysis(share_link):
    """Load a share link in headless Chrome and extract a summary of the page.

    The page is rendered with Selenium, then its HTML is parsed with lxml to
    collect:

    * the JSON-LD metadata from the first
      ``<script type="application/ld+json">`` element,
    * the "user" JSON object embedded in the first ``/html/body/script``
      element (located with a regular expression),
    * the page ``<title>`` (stored under ``"keywords"``),
    * the text of the ``<p>`` elements under ``#app//main/div``, joined with
      single spaces (stored under ``"content"``).

    Args:
        share_link: URL of the page to analyse.

    Returns:
        dict: the merged fields restricted to the keys ``@type``,
        ``nickname``, ``red_id``, ``headline``, ``keywords``, ``content``,
        ``datePublished`` and ``uploadDate`` (only those actually present).

    Raises:
        IndexError: if an expected element or the regex match is missing
            from the page.
        json.JSONDecodeError: if an extracted JSON fragment is malformed.

    The browser is always shut down, even when extraction fails.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # webdriver_path / WEBDRIVER_IMPLICITLY_WAIT are not defined in this
    # block; presumably they come from ``from settings import *``.
    driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=webdriver_path)

    # From here on the Chrome process exists, so every later step runs under
    # try/finally: a missing element or bad JSON must not leak the browser.
    try:
        run_log.logger.info("启动chrome浏览器")

        driver.implicitly_wait(WEBDRIVER_IMPLICITLY_WAIT)
        run_log.logger.debug("设置全局显示等待{}秒钟".format(WEBDRIVER_IMPLICITLY_WAIT))

        driver.get(share_link)
        # Fixed pause before reading the DOM (presumably to let client-side
        # rendering finish -- TODO confirm whether an explicit wait is better).
        time.sleep(1)
        page_source = driver.page_source
        selector = etree.HTML(page_source)

        # JSON-LD metadata block in the document head.
        head_info = selector.xpath('//script[@type = "application/ld+json"]/text()')[0]
        head_info = json.loads(head_info)
        run_log.logger.debug(head_info)

        # The first body script embeds page state; cut the "user" object out
        # of it with a regex and parse that fragment as JSON.
        body_info = selector.xpath('/html/body/script/text()')[0]
        body_info = re.findall(r'censorTip":"","user":(.*),"poi":', body_info)[0]
        body_info = json.loads(body_info)
        # Log the body data that was just parsed (the original logged
        # head_info a second time here).
        run_log.logger.debug(body_info)

        keywords = selector.xpath('//title/text()')[0]

        content = " ".join(selector.xpath('//*[@id="app"]//main/div/p/text()'))
        run_log.logger.debug(content)

        # Merge the fields; on key clashes body_info overrides head_info.
        info = {}
        info.update(head_info)
        info.update(body_info)
        info["keywords"] = keywords
        info["content"] = content

        # Keep only the fields of interest for the caller.
        info_key = ['@type', 'nickname', 'red_id', 'headline', 'keywords', 'content', 'datePublished', 'uploadDate']
        info_lite = {k: v for k, v in info.items() if k in info_key}
        run_log.logger.info(info_lite)

        # Pause kept from the original flow before closing the browser.
        time.sleep(1)
        return info_lite
    finally:
        driver.quit()
        run_log.logger.info("退出chrome浏览器")


def main():
    """Analyse one sample share link and print the extracted fields."""
    sample_link = "https://www.xiaohongshu.com/discovery/item/60ddba2500000000010252dd"
    print(share_link_analysis(sample_link))


# Run the sample analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()