加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
FuYangRBao.py 6.07 KB
一键复制 编辑 原始数据 按行查看 历史
Cwoner 提交于 2024-06-03 09:44 . first commit
# -*-coding:utf-8 -*-
# @Time: 2023/2/20 0020 下午 14:33
# @Author: Cwoner
# @Organization: CTIM
# @Software: PyCharm
import requests
from lxml import etree
import hashlib
from setting import SAVEPATH
import os
import re
from tools.serverAPI import upload_file
class FuYangRBao():
    """Scraper for the 阜阳日报 (Fuyang Daily) e-paper hosted on test.fynews.net.

    Workflow: resolve the issue index URL for a given date, walk every page
    (版面) listed on the index, fetch each page's snapshot image, save it under
    ``SAVEPATH/<mid>/<ccdate>/`` and hand the bytes to ``upload_file``.
    """

    def __init__(self, ccdate, id=''):
        """
        :param ccdate: publication date as ``YYYYMMDD``, e.g. ``'20230101'``.
        :param id: optional external identifier, kept for caller compatibility
            (note: shadows the builtin ``id``; name preserved for the public API).
        """
        # May be None when no issue exists for that date; index() checks this.
        self.url = self.get_index_url(ccdate)
        self.ccdate = ccdate
        self.id = id
        self.name = '阜阳日报'
        print('初始化:', self.name, self.url)
        # Stable entity id derived from the paper name; doubles as the
        # on-disk directory name under SAVEPATH.
        self.mid = hashlib.md5((self.name + '_baozi').encode()).hexdigest()
        # makedirs(exist_ok=True) replaces the racy isdir-then-mkdir pair.
        os.makedirs(SAVEPATH + self.mid, exist_ok=True)

    def get_index_url(self, ccdate):
        """Return the absolute URL of the issue index page for ``ccdate``,
        or ``None`` when the month listing has no issue for that day."""
        headers = {
            "Accept": "application/json, text/javascript, */*",
            "Accept-Language": "zh,zh-CN;q=0.9",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded",
            "Origin": "http://test.fynews.net",
            "Referer": "http://test.fynews.net/epaper/read.do?m=i&iid=2858&idate=2_2023-04-11",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
        url = "http://test.fynews.net/epaper/read.do"
        params = {
            "m": "getIssueByMonth"
        }
        data = {
            "newspaperId": "1",
            "yyyymm": ccdate[:6]  # the API lists issues one month at a time
        }
        # timeout added so a stalled server cannot hang the crawler forever.
        response = requests.post(url, headers=headers, params=params,
                                 data=data, verify=False, timeout=15)
        if response.status_code != 200:
            return None
        wanted = f'{ccdate[:4]}-{ccdate[4:6]}-{ccdate[6:]}'
        for item in response.json()['data']:
            if item['idateDisp'] == wanted:
                return 'http://test.fynews.net' + item['path']
        return None

    def index(self):
        """Fetch the issue index page and download every page image."""
        if not self.url:
            print(f'{self.ccdate}无报纸!')
            return
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Referer": "http://epaper.hljnews.cn/hljrb/pc/layout/202302/20/",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
        }
        response = requests.get(self.url, headers=headers, verify=False, timeout=15)
        if response.status_code != 200:
            return
        html = etree.HTML(response.content.decode('utf-8'))
        bans = html.xpath('//div[@class="contentNews"]//li')
        # BUG FIX: the original initialized ``i = 0`` but never incremented it,
        # so every untitled page got the same fallback title and therefore the
        # same md5 tid/filename, silently overwriting earlier downloads.
        for i, banItem in enumerate(bans):
            title = banItem.xpath('string(.)')
            if not title:
                title = f'未知标题{i}'
            title = re.sub(r'\s', '', title)  # strip all whitespace from titles
            print(self.name, title)
            page_links = banItem.xpath('./span/a')
            if not page_links:
                # Guard against stray <li> elements / layout drift instead of
                # crashing with IndexError on [0].
                continue
            page_url = 'http://test.fynews.net' + page_links[0].xpath('./@href')[0]
            img_url = self.get_image_url(page_url)
            # Deterministic per-page id: paper + page title + date.
            tid = hashlib.md5((self.name + title + self.ccdate).encode()).hexdigest()
            file = tid + '.jpg'
            if img_url:
                img_url = img_url.replace('../../../', '')
                print(img_url)
                self.__download(file, img_url, title, tid)

    def get_image_url(self, url):
        """Return the absolute URL of one page's snapshot image, or ``None``
        on a non-200 response or when the page carries no snapshot image."""
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Referer": "http://epaper.hljnews.cn/hljrb/pc/layout/202302/20/",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers, verify=False, timeout=15)
        if response.status_code != 200:
            return None
        html = etree.HTML(response.content.decode('utf-8'))
        srcs = html.xpath('//img[@class="snapshot_img"]/@src')
        if not srcs:
            # Page exists but has no snapshot image — treat like a failed fetch.
            return None
        return 'http://test.fynews.net' + srcs[0].replace('../../../', '')

    def __download(self, file, url, title, tid, c=0):
        """Download one page image, save it locally and push it upstream.

        Retries up to 3 extra times (``c`` is the retry counter) on network
        errors only; a non-200 response is not retried.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
        }
        try:
            response = requests.get(url, headers=headers, verify=False, timeout=30)
        except requests.RequestException:
            # Narrowed from a bare ``except:`` which also swallowed
            # KeyboardInterrupt/SystemExit; only network failures are retried.
            if c > 2:
                return
            return self.__download(file, url, title, tid, c + 1)
        if response.status_code != 200:
            return
        path = SAVEPATH + self.mid + '/' + self.ccdate + '/'
        os.makedirs(path, exist_ok=True)
        with open(path + file, 'wb') as f:
            f.write(response.content)
        data = {
            'entity_id': self.mid,
            'title': title,
            'tid': tid,
            'file_name': file,
            'origin_url': url,
            'ndate': self.ccdate[:4] + '-' + self.ccdate[4:6] + '-' + self.ccdate[6:]
        }
        upload_file(data, response.content)

    def run(self):
        """Entry point: crawl and upload the issue configured at construction."""
        self.index()
if __name__ == '__main__':
    # Crawl a single hard-coded issue date (YYYYMMDD) when run as a script.
    target_date = '20230417'
    crawler = FuYangRBao(target_date)
    crawler.run()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化