加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
BenGangRBao.py 6.21 KB
一键复制 编辑 原始数据 按行查看 历史
Cwoner 提交于 2024-06-03 09:44 . first commit
# -*-coding:utf-8 -*-
# @Time: 2023/2/20 0020 下午 14:33
# @Author: Cwoner
# @Organization: CTIM
# @Software: PyCharm
import time
import requests
from lxml import etree
import hashlib
from setting import SAVEPATH
import os
import re
from urllib import parse
from tools.serverAPI import upload_file
from tools.give_an_alarm import send_email
class BenGangRBao():
    """Collector for the 'BenGangRBao' publication hosted on eqxiu pages.

    The collector reads an index page, finds the element whose content
    carries the day-of-month of ``ccdate``, follows the trigger attached to
    that element to the issue page, then downloads every page image, stores
    it under ``SAVEPATH`` and passes it to ``upload_file``.
    """

    # Seconds to wait for any single HTTP request before giving up on it.
    REQUEST_TIMEOUT = 10
    # Failures tolerated before alarming; the third failure aborts.
    MAX_RETRIES = 2

    def __init__(self, ccdate, id=''):  # ccdate example: 20230101
        """Prepare the output directory and fetch the index publish time.

        Args:
            ccdate: issue date as an 8-character ``YYYYMMDD`` string.
            id: optional identifier, stored on the instance unchanged.

        Raises:
            requests.RequestException: if the initial publish-time request
                fails (network errors are not retried here).
        """
        self.ccdate = ccdate
        self.id = id
        self.name = 'BenGangRBao'
        print('初始化:', self.name)
        self.mid = hashlib.md5((self.name + '_baozi').encode()).hexdigest()
        # exist_ok avoids the check-then-create race and also creates
        # missing parent directories.
        os.makedirs(SAVEPATH + self.mid, exist_ok=True)
        self.pubTime = self.get_pubTime()

    @staticmethod
    def _parse_publish_time(text):
        """Return the digits following ``publishTime: `` in *text*, or None."""
        found = re.search(r'publishTime: (\d+),', text, re.M)
        return found.group(1) if found else None

    @staticmethod
    def _extract_share_code(url):
        """Return the part of *url* between ``s/`` and ``?``, or None."""
        found = re.search(r's/(.*)\?', url)
        return found.group(1) if found else None

    def _day_marker(self):
        """Return the ``>D<`` marker for the day of ``ccdate`` (no zero pad)."""
        return '>' + str(int(self.ccdate[6:])) + '<'

    def _ndate(self):
        """Return ``ccdate`` reformatted as ``YYYY-MM-DD``."""
        return self.ccdate[:4] + '-' + self.ccdate[4:6] + '-' + self.ccdate[6:]

    def _get_with_retry(self, url, alarm_body, c=0, **kwargs):
        """GET *url*, retrying on transport errors.

        Args:
            url: address to request.
            alarm_body: body of the alarm email sent once retries run out.
            c: number of failures already counted before this call.
            **kwargs: forwarded to ``requests.get``.

        Returns:
            The response object, or None after the alarm has been sent.
        """
        while True:
            try:
                return requests.get(url, timeout=self.REQUEST_TIMEOUT, **kwargs)
            except requests.RequestException:
                c += 1
                if c > self.MAX_RETRIES:
                    send_email(f'报纸采集系统异常-{self.name}-{int(time.time())}', alarm_body)
                    return None

    def index(self, c=0):
        """Locate the issue for ``ccdate`` on the index page and collect it.

        Args:
            c: number of request failures already counted (retry state).

        Returns:
            None. Returns early when the index cannot be fetched, is not
            HTTP 200, or contains no pages.
        """
        url = "https://s1-cdn.eqxiu.com/eqs/s/page/240274568"
        params = {
            "code": "W73ePtvF",
            "time": self.pubTime
        }
        alarm_body = f'主页请求器异常,异常次数超3次,采集退出。报纸:{self.name}. mid: {self.mid}. 报纸日期:{self.ccdate}.'
        response = self._get_with_retry(url, alarm_body, c, params=params)
        if response is None or response.status_code != 200:
            return

        pages = response.json().get('list') or []
        if not pages:
            return
        first_page = pages[0]
        elements = first_page.get('elements') or []
        triggers = (first_page.get('properties') or {}).get('triggerGroup') or []

        marker = self._day_marker()
        for element in elements:
            content = element.get('content')
            if not content or marker not in content:
                continue
            print(element)
            element_id = element.get('id')
            if element_id is None:
                continue
            for trigger in triggers:
                if trigger.get('targetId') != element_id:
                    continue
                target = trigger.get('targetContent')
                if not target or 'eqxiu.cn' not in target:
                    continue
                issue_url = parse.unquote(target)
                share_code = self._extract_share_code(issue_url)
                if share_code is None:
                    # Link without the expected "s/<code>?" shape: skip it
                    # instead of aborting the remaining entries.
                    continue
                self.get_imageUrl(share_code, self.get_pubTime(issue_url))

    def get_pubTime(self, url="https://q.eqxiu.com/s/W73ePtvF"):
        """Fetch *url* and extract the ``publishTime`` value from its HTML.

        Args:
            url: page to read; defaults to the index share page.

        Returns:
            The publish time as a string of digits, or None when the
            response is not HTTP 200 or the value is not present.

        Raises:
            requests.RequestException: on network failure or timeout.
        """
        headers = {
            "authority": "q.eqxiu.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "sec-ch-ua": "\"Google Chrome\";v=\"111\", \"Not(A:Brand\";v=\"8\", \"Chromium\";v=\"111\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
        }
        # A timeout keeps a stalled connection from hanging the collector.
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        return self._parse_publish_time(response.text)

    def get_imageUrl(self, code, pubTime):
        """Fetch the issue's page list and download each page image.

        Args:
            code: share code of the issue page.
            pubTime: publish time sent as the ``time`` query parameter.

        Returns:
            None.

        Raises:
            requests.RequestException: on network failure or timeout.
        """
        url = "https://s1-cdn.eqxiu.com/eqs/s/page/252057498"
        params = {
            "code": code,
            "time": pubTime
        }
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return
        data = response.json()
        # NOTE(review): 120607 is treated as "nothing to collect"; the exact
        # meaning of this API code is not visible here - confirm.
        if data.get('code') == 120607:
            return
        for item in data.get('list') or []:
            elements = item.get('elements') or []
            if len(elements) < 2:
                # The image is read from the second element; skip pages
                # that do not have one.
                continue
            properties = elements[1].get('properties') or {}
            if 'originSrc' not in properties:
                continue
            title = item['name']
            originSrc = 'https://asset.eqh5.com/' + properties['originSrc']
            print(title, originSrc)
            tid = hashlib.md5((self.name + title + self.ccdate).encode()).hexdigest()
            file = tid + '.png'
            self.__download(file, originSrc, title, tid)

    def __download(self, file, url, title, tid, c=0):
        """Download one page image, save it locally and upload it.

        Args:
            file: target file name inside the issue directory.
            url: image address.
            title: page title recorded in the upload metadata.
            tid: page identifier recorded in the upload metadata.
            c: number of request failures already counted (retry state).

        Returns:
            None. Nothing is saved when the request keeps failing or the
            response is not HTTP 200.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
        }
        alarm_body = f'下载报纸数据异常,异常次数超3次,现已终止该页面的采集。页面地址:{url}。 报纸:{self.name}. mid: {self.mid}. 报纸日期:{self.ccdate}.'
        # NOTE(review): verify=False disables TLS certificate validation.
        # Kept to preserve existing behaviour - confirm it is still needed.
        response = self._get_with_retry(url, alarm_body, c, headers=headers, verify=False)
        if response is None or response.status_code != 200:
            return

        path = SAVEPATH + self.mid + '/' + self.ccdate + '/'
        os.makedirs(path, exist_ok=True)
        with open(path + file, 'wb') as f:
            f.write(response.content)
        data = {
            'entity_id': self.mid,
            'title': title,
            'tid': tid,
            'file_name': file,
            'origin_url': url,
            'ndate': self._ndate()
        }
        upload_file(data, response.content)

    def run(self):
        """Run one collection pass for ``ccdate``."""
        self.index()
if __name__ == '__main__':
    # Script entry point: collect the issue dated 2023-01-06.
    BenGangRBao('20230106').run()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化