# batch-download-python/pdf_download.py (branch: master)

# coding = UTF-8
# Download the PDF documents linked from an HTML page (originally tried against a hand-written local page: file:///E:/ZjuTH/Documents/pythonCode/pythontest.html)

import urllib.request
import re
import os
from bs4 import BeautifulSoup
import requests
import shutil
import datetime

def getHtmlNew(url):
    """Fetch *url* and return the markup of its first ``.Therestlist`` element.

    The page is downloaded with ``requests``, its text encoding is set from
    ``response.apparent_encoding``, and the result is parsed with
    BeautifulSoup using the ``lxml`` parser.

    Args:
        url: Address of the page to fetch.

    Returns:
        The first element matching the CSS selector ``.Therestlist``,
        converted back to a string.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the page contains no ``.Therestlist`` element.
    """
    # Without a timeout, requests waits indefinitely on a silent server.
    response = requests.get(url, timeout=30)
    # Stop on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    html = BeautifulSoup(response.text, 'lxml')
    content = html.select('.Therestlist')
    if not content:
        # Same exception type the bare content[0] raised, but with context.
        raise IndexError("no element matching '.Therestlist' found at " + url)
    return str(content[0])

# open the url and read
def getHtml(url):
    """Open *url* and return what follows ``TherestList/`` in the page text.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The list produced by ``re.findall('(?<=TherestList/).*$', text)``,
        where ``text`` is the decoded response body. The pattern is applied
        without ``re.MULTILINE``, so ``$`` only anchors at the end of the
        text.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
    """
    # The context manager closes the response even if reading fails.
    with urllib.request.urlopen(url) as page:
        raw = page.read()
        # Fall back to UTF-8 when the response declares no charset.
        charset = page.headers.get_content_charset() or 'utf-8'
    # read() returns bytes; decode first so the str pattern can be applied.
    text = raw.decode(charset, errors='replace')
    return re.findall('(?<=TherestList/).*$', text)

# compile the regular expressions and find
# all stuff we need
def getUrl(html, reg):
    """Return every match of the regular expression *reg* found in *html*.

    Args:
        html: Text to search.
        reg: Regular expression source; it is compiled on each call.

    Returns:
        The list that ``findall`` yields for the compiled pattern.
    """
    return re.compile(reg).findall(html)

def getFile(url, realName):
    """Download *url* into the current working directory.

    The file is saved under the last ``/``-separated component of *url*.

    Args:
        url: Address of the file to download.
        realName: Name shown in the success message only; it does not affect
            where the file is written.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        OSError: If the local file cannot be written.
    """
    file_name = url.split('/')[-1]
    block_sz = 8192
    # Both handles are closed by the context managers, even if the copy
    # fails part-way through.
    with urllib.request.urlopen(url) as u, open(file_name, 'wb') as f:
        shutil.copyfileobj(u, f, block_sz)
    print("Sucessful to download" + " " + realName)


root_url = 'http://jb.sznews.com/'  # common prefix of every download address

# Ask for the issue date; blank (or whitespace-only) input means "today".
dateFormat = input("请输入日期数据，格式参考（20210520）: ").strip()
if not dateFormat:
    dateFormat = datetime.datetime.now().strftime('%Y%m%d')

# The layout URL is built from the first six characters of the date, a
# slash, and the remaining characters (YYYYMM/DD for an 8-digit date).
dateStr = dateFormat[0:6] + "/" + dateFormat[6:]
raw_url = 'http://jb.sznews.com/PC/layout/' + dateStr + '/node_A01.html'

# reg captures the target of each href that ends in ".pdf";
# reg2 captures tokens made of an underscore, one capital letter and digits
# (for example "_A01").
reg = r'(?:href|HREF)="?((?:http://)?.+?\.pdf)'
reg2 = r'(_[A-Z]\d+)'

# Fetch the index page once and run both patterns over the same markup.
html = getHtmlNew(raw_url)
url_lst = getUrl(html, reg)
url_lst2 = getUrl(html, reg2)

# Create the per-date output folder and make it the working directory,
# because getFile() writes into the current directory.
# os.path.join keeps the path valid on systems where "\\" is not a separator.
path = os.path.join('jb_pdf_download', dateFormat)
os.makedirs(path, exist_ok=True)
os.chdir(path)

# Download every linked PDF, then rename it to "第<page>版.pdf".
# NOTE(review): the pairing assumes reg and reg2 produce their matches in the
# same order and number — confirm against the page markup. zip() stops at the
# shorter list instead of raising IndexError part-way through the downloads.
for url, name in zip(url_lst, url_lst2):
    # Drop the first 12 characters of the href before prefixing root_url
    # (presumably a "../../../../" relative prefix — TODO confirm).
    url = root_url + url[12:]
    # getFile() saves under the last path component of the URL.
    oldName = url.rsplit('/', 1)[-1]
    # name looks like "_A01"; strip the leading underscore for the label.
    newName = '第' + name[1:] + '版.pdf'
    getFile(url, newName)
    curl = os.getcwd()
    # os.path.join instead of a literal "\\" so the rename works off Windows.
    shutil.move(os.path.join(curl, oldName), os.path.join(curl, newName))