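# NOTE(review): hosting-page notice captured with the file, not part of the program: "Code pull complete; the page will refresh automatically."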
# -*- coding: utf-8 -*-
# Crawl the PDF documents linked from a self-written HTML page, URL: file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
import urllib.request
import re
import os
from bs4 import BeautifulSoup
import requests
import shutil
import datetime
def getHtmlNew(url):
    """Fetch *url* and return the markup of its first ``.Therestlist`` element.

    The response body is decoded with the encoding that ``requests`` detects
    from the content (``apparent_encoding``) and parsed with BeautifulSoup's
    ``lxml`` parser.

    Args:
        url: Address of the page to download.

    Returns:
        str: The first element matching the CSS selector ``.Therestlist``,
        serialized back to an HTML string.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page contains no ``.Therestlist`` element.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    html = BeautifulSoup(response.text, 'lxml')
    content = html.select('.Therestlist')
    if not content:
        # Same exception type as indexing the empty result would raise,
        # but with a message that says what is actually missing.
        raise IndexError(
            "no element with class 'Therestlist' found at " + url
        )
    return str(content[0])
# open the url and read
def getHtml(url):
    """Download *url* and return the text that follows ``TherestList/``.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        list[str]: All matches of ``(?<=TherestList/).*$`` in the decoded
        page. The pattern is applied without ``re.MULTILINE``, so ``$``
        anchors at the end of the document and only text on the last line
        can match.

    Raises:
        urllib.error.URLError: If the address cannot be opened.
    """
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url) as page:
        raw = page.read()
        # Use the charset announced by the server when there is one.
        charset = page.headers.get_content_charset() or 'utf-8'
    # The pattern is a str, so the downloaded bytes must be decoded first;
    # matching a str pattern against bytes raises TypeError.
    text = raw.decode(charset, errors='replace')
    return re.findall('(?<=TherestList/).*$', text)
# compile the regular expressions and find
# all stuff we need
def getUrl(html, reg):
    """Return every match of the regular expression *reg* found in *html*.

    Args:
        html: Text to search.
        reg: Regular expression; when it contains one capture group, the
            returned items are the captured substrings.

    Returns:
        list: The ``findall`` results, in order of appearance.
    """
    return re.findall(reg, html)
def getFile(url, realName):
    """Download *url* into the current working directory.

    The file is stored under the last ``/``-separated segment of *url*.
    *realName* does not affect where the file is written; it only appears in
    the message printed once the download has finished.

    Args:
        url: Address of the file to download.
        realName: Name shown in the completion message.

    Raises:
        urllib.error.URLError: If the address cannot be opened.
        OSError: If the local file cannot be created or written.
    """
    file_name = url.split('/')[-1]
    block_sz = 8192
    # Context managers close both the connection and the local file even
    # when the transfer fails part-way through.
    with urllib.request.urlopen(url) as u, open(file_name, 'wb') as f:
        # Stream in fixed-size chunks so large files are never held in memory.
        shutil.copyfileobj(u, f, block_sz)
    print("Sucessful to download" + " " + realName)
# Script body: download every PDF page of one issue of the newspaper and
# rename each file to '第<page>版.pdf'.

# Part of the address shared by every download link.
root_url = 'http://jb.sznews.com/'

# Ask for the issue date; an empty (or whitespace-only) answer selects today.
dateFormat = input("请输入日期数据,格式参考(20210520): ").strip()
if not dateFormat:
    dateFormat = datetime.datetime.now().strftime('%Y%m%d')
# Stop early on malformed input instead of building a broken URL from it.
if len(dateFormat) != 8 or not dateFormat.isdigit():
    raise SystemExit(
        "Invalid date %r: expected 8 digits such as 20210520" % dateFormat
    )

# The site lays issues out as .../layout/YYYYMM/DD/...
dateStr = dateFormat[0:6] + "/" + dateFormat[6:]
raw_url = 'http://jb.sznews.com/PC/layout/' + dateStr + '/node_A01.html'

# reg captures the PDF links; reg2 captures page codes such as '_A01'.
reg = r'(?:href|HREF)="?((?:http://)?.+?\.pdf)'
reg2 = r'(_[A-Z]\d+)'
html = getHtmlNew(raw_url)
url_lst = getUrl(html, reg)
url_lst2 = getUrl(html, reg2)

# Create the per-date output folder and work inside it. os.path.join keeps
# the path valid on every platform, not only where '\\' is the separator.
path = os.path.join('jb_pdf_download', dateFormat)
os.makedirs(path, exist_ok=True)
os.chdir(path)

# The working directory does not change during the loop, so read it once.
curl = os.getcwd()

# Links and page codes are paired by position; zip stops at the shorter
# list, so a mismatch in length cannot raise IndexError.
for url, name in zip(url_lst, url_lst2):
    # Drops the first 12 characters of the scraped link before prefixing the
    # site root - presumably a relative prefix such as '../../../../';
    # TODO confirm against the live page.
    url = root_url + url[12:]
    # getFile stores the download under the last URL segment.
    oldName = url.rsplit('/', 1)[-1]
    newName = '第' + name[1:] + '版.pdf'
    getFile(url, newName)
    shutil.move(os.path.join(curl, oldName), os.path.join(curl, newName))
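# NOTE(review): hosting-page moderation notice captured with the file, not part of the program: "This section may contain content unsuitable for display, so the page does not show it. You can review and modify it yourself with the editing functions."
# "If you confirm the content involves no inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false or worthless content, and nothing that violates national laws and regulations, you can click submit to appeal and we will handle it as soon as possible."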