pdf_download.py
# coding: utf-8
# Download the PDF documents linked from a self-written HTML page,
# URL: file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
import urllib.request
import re
import os
from bs4 import BeautifulSoup
import requests
import shutil
import datetime
# Fetch the front page and return the block that lists all pages of the issue.
def getHtmlNew(url):
    # url = 'http://jb.sznews.com/PC/layout/202105/20/node_A01.html'
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    html = BeautifulSoup(response.text, 'lxml')
    content = html.select('.Therestlist')
    return str(content[0])
# Older urllib-based variant (not used in the flow below): open the url and read it.
def getHtml(url):
    page = urllib.request.urlopen(url)
    html = page.read().decode('UTF-8')
    html = re.findall('(?<=TherestList/).*$', html)
    page.close()
    return html
# Compile the regular expression and return every match found in the HTML.
def getUrl(html, reg):
    url_re = re.compile(reg)
    url_lst = url_re.findall(html)  # list of captured groups
    return url_lst
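# Quick illustration of getUrl (the href below is a hypothetical snippet,
# not taken from jb.sznews.com):
#   getUrl('<a href="../../../res/202105/20/jb_A01.pdf">A01</a>', r'(_[A-Z]\d+)')
#   returns ['_A01'], while the PDF-link pattern used further down captures
#   '../../../res/202105/20/jb_A01.pdf' from the same snippet.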
# Download one file in 8 KB blocks; the file keeps the name taken from the URL.
def getFile(url, realName):
    file_name = url.split('/')[-1]
    u = urllib.request.urlopen(url)
    f = open(file_name, 'wb')
    block_sz = 8192
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        f.write(buffer)
    f.close()
    print("Successfully downloaded " + realName)
root_url = 'http://jb.sznews.com/'  # part of the download address shared by every file
dateFormat = input("请输入日期数据,格式参考(20210520): ")  # enter a date like 20210520
if len(dateFormat) == 0:
    # default to today's date when nothing is entered
    dateFormat = datetime.datetime.now().strftime('%Y%m%d')
dateStr = dateFormat[0:6] + "/" + dateFormat[6:]
raw_url = 'http://jb.sznews.com/PC/layout/' + dateStr + '/node_A01.html'

reg = r'(?:href|HREF)="?((?:http://)?.+?\.pdf)'  # links to the PDF files
reg2 = r'(_[A-Z]\d+)'                            # page codes such as _A01

html = getHtmlNew(raw_url)
url_lst = getUrl(html, reg)
url_lst2 = getUrl(html, reg2)

# create the output folder, e.g. jb_pdf_download\20210520, and work inside it
path = 'jb_pdf_download' + '\\' + dateFormat
if not os.path.exists(path):
    os.makedirs(path)
os.chdir(os.path.join(os.getcwd(), path))

for x in range(len(url_lst)):
    url = url_lst[x]
    name = url_lst2[x]
    url = root_url + url[12:]  # build the full download address
    end = "/"
    string2 = url[url.rfind(end):]
    oldName = string2[1:]                  # file name as it appears in the URL
    newName = '第' + name[1:] + '版.pdf'   # e.g. 第A01版.pdf
    curl = os.getcwd()
    getFile(url, newName)
    shutil.move(curl + "\\" + oldName, curl + "\\" + newName)  # rename to the page-based name
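# Example run (assumption: Python 3 with requests, BeautifulSoup and lxml installed,
# launched from the directory that holds this script):
#   $ python pdf_download.py
#   请输入日期数据,格式参考(20210520): 20210520
# The PDFs of that issue are then saved under jb_pdf_download\20210520 and renamed
# page by page to 第A01版.pdf, 第A02版.pdf, and so on.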