代码拉取完成,页面将自动刷新
同步操作将从 Cwoner/ctim_newspaper_spider 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
# -*-coding:utf-8 -*-
# @Time: 2023/2/20 0020 下午 14:33
# @Author: Cwoner
# @Organization: CTIM
# @Software: PyCharm
import requests
from lxml import etree
import hashlib
from setting import SAVEPATH
import os
import re
from tools.serverAPI import upload_file
class ShanWeiRBao():
    """Spider for the 汕尾日报 (Shanwei Daily) e-paper.

    For one issue date the spider resolves the issue id, lists the pages of
    that issue, saves every page image below ``SAVEPATH`` and passes the
    image bytes plus metadata to ``upload_file``.
    """

    # Seconds to wait on any single HTTP request before requests raises.
    REQUEST_TIMEOUT = 30
    # Total number of tries for one image download (first try + retries).
    MAX_DOWNLOAD_ATTEMPTS = 5

    def __init__(self, ccdate, id=''):  # ccdate example: 20230101
        """Remember the issue date and make sure the output directory exists.

        Args:
            ccdate: Issue date as an 8-character ``YYYYMMDD`` string.
            id: Stored on ``self.id``; not read by any method of this class.
        """
        # NOTE(review): this URL targets e.hznews.com rather than the Shanwei
        # site; it is only printed below and used nowhere else in this class.
        # Looks like a copy-paste leftover -- confirm before changing it.
        self.url = f'http://e.hznews.com/hzrb/pc/{ccdate[:6]}/{ccdate[6:]}/node_A01.html'
        self.ccdate = ccdate
        self.id = id
        self.name = '汕尾日报'
        # Filled in by get_date_data(); None means "no issue id known".
        self.ISSUE_ID = None
        print('初始化:',self.name, self.url)
        # md5 of the paper name doubles as directory name and upload entity id.
        self.mid = hashlib.md5((self.name+'_baozi').encode()).hexdigest()
        # makedirs(exist_ok=True) also creates SAVEPATH itself when missing
        # and does not race between the existence check and the creation.
        os.makedirs(SAVEPATH + self.mid, exist_ok=True)

    def _iso_date(self):
        """Return ``self.ccdate`` (``YYYYMMDD``) formatted as ``YYYY-MM-DD``."""
        return f"{self.ccdate[:4]}-{self.ccdate[4:6]}-{self.ccdate[6:]}"

    def get_date_data(self):
        """Resolve the issue id for ``self.ccdate`` into ``self.ISSUE_ID``.

        ``self.ISSUE_ID`` is reset to ``None`` first and stays ``None`` when
        the server does not answer with HTTP 200 or the JSON payload carries
        no ``ISSUE_ID`` entry. Network errors raised by ``requests`` and JSON
        decoding errors are not caught here.
        """
        self.ISSUE_ID = None
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "zh,zh-CN;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Origin": "http://epaper.shanweinews.net",
            "Pragma": "no-cache",
            "Referer": "http://epaper.shanweinews.net/u/res/skin/epaper/htm/szb.html?siteid=2A90839E2034DF53854DA912D5496033",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
        cookies = {
            "HA_STICKY_ucap1": "ucap1.srv2"
        }
        url = "http://epaper.shanweinews.net/u/epaper/issue/2A90839E2034DF53854DA912D5496033/ajaxList"
        params = {
            "issue": ""
        }
        data = {
            "issue_time": self._iso_date(),
            "issueid": ""
        }
        response = requests.post(
            url,
            headers=headers,
            cookies=cookies,
            params=params,
            data=data,
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            print(f'{self.name}: issue lookup returned HTTP {response.status_code}')
            return
        payload = response.json()
        # A payload without the key leaves ISSUE_ID as None instead of raising.
        if isinstance(payload, dict):
            self.ISSUE_ID = payload.get('ISSUE_ID')

    def index(self):
        """Download every page image of the issue for ``self.ccdate``.

        Does nothing (apart from printing a notice) when no issue id could be
        resolved or the page list request does not return HTTP 200.
        """
        self.get_date_data()
        if not self.ISSUE_ID:
            print(f'{self.name}: no issue id for {self.ccdate}, nothing to download')
            return
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "zh,zh-CN;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Length": "0",
            "Origin": "http://epaper.shanweinews.net",
            "Pragma": "no-cache",
            "Referer": "http://epaper.shanweinews.net/u/res/skin/epaper/htm/szb.html?siteid=2A90839E2034DF53854DA912D5496033",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
        url = f"http://epaper.shanweinews.net/u/epaper/page/{self.ISSUE_ID}/ajaxList"
        params = {
            "pages": ""
        }
        response = requests.post(
            url,
            headers=headers,
            params=params,
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            print(f'{self.name}: page list returned HTTP {response.status_code}')
            return
        for ban in response.json():
            title = ban['page_name']
            img_url = 'http://epaper.shanweinews.net' + ban['page_image_location']
            # Stable id per (paper, page title, date).
            tid = hashlib.md5((self.name + title + self.ccdate).encode()).hexdigest()
            print(title,img_url)
            # Keep the remote file extension on the locally stored file.
            file = tid + f'.{img_url.split(".")[-1]}'
            self.__download(file,img_url,title,tid)

    def get_image_url(self,url):
        """Fetch ``url`` and build an image URL from its ``img.preview`` tag.

        Returns:
            The image URL string, or ``None`` when the page does not return
            HTTP 200 or contains no ``<img class="preview">`` element.

        NOTE(review): no other method of this class calls this, and the URL it
        builds is on e.hznews.com -- presumably left over from another spider;
        confirm whether it is still needed.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh,zh-CN;q=0.9",
            "Cache-Control": "no-cache",
            "Pragma": "no-cache",
            "Proxy-Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
        }
        response = requests.get(
            url, headers=headers, verify=False, timeout=self.REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            return None
        html = etree.HTML(response.text)
        sources = html.xpath('//img[@class="preview"]/@src')
        if not sources:
            # Previously an IndexError; a missing tag now yields None.
            return None
        img = sources[0]
        img_url = f'http://e.hznews.com/hzrb/pc/pic{self.ccdate[:6]}/{self.ccdate[6:]}/{img}'
        return img_url

    def __download(self,file,url,title,tid,c=0):
        """Download one page image, store it on disk and upload it.

        Args:
            file: File name to store the image under.
            url: Image URL to fetch.
            title: Page title sent along with the upload.
            tid: Id sent along with the upload.
            c: Number of attempts already used up; at least one attempt is
                always made.

        Request errors are retried until ``MAX_DOWNLOAD_ATTEMPTS`` tries are
        used up, after which the image is skipped. A non-200 response is
        skipped without retrying.
        """
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
        }
        response = None
        attempts_left = max(1, self.MAX_DOWNLOAD_ATTEMPTS - c)
        for _ in range(attempts_left):
            try:
                response = requests.get(
                    url, headers=headers, verify=False, timeout=self.REQUEST_TIMEOUT
                )
            except requests.RequestException as exc:
                # Only network-level failures are retried; KeyboardInterrupt
                # and programming errors are no longer swallowed.
                print(f'{self.name}: download failed for {url}: {exc}')
                continue
            break
        if response is None or response.status_code != 200:
            return
        path = SAVEPATH + self.mid + '/' + self.ccdate + '/'
        os.makedirs(path, exist_ok=True)
        with open(path + file, 'wb') as f:
            f.write(response.content)
        data = {
            'entity_id': self.mid,
            'title': title,
            'tid': tid,
            'file_name': file,
            'origin_url': url,
            'ndate': self._iso_date()
        }
        upload_file(data, response.content)

    def run(self):
        """Run the spider for the configured date."""
        self.index()
if __name__ == '__main__':
    # Script entry point: crawl one hard-coded issue date (YYYYMMDD).
    spider = ShanWeiRBao('20240408')
    spider.run()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。