master

分支 (1)

管理

管理

master

wangtaodiwuzhouzuoye
/
第二章作业2.py

import urllib.request as ur
import lxml.etree as le
import re

# Search URL template for so.csdn.net. It contains two str.format() fields
# that must be filled in before the URL is requested: {page} (the `p` query
# parameter) and {keyword} (the `q` query parameter).
url = 'https://so.csdn.net/so/search/s.do?p={page}&q={keyword}&t=blog&viparticle=&domain=&o=&s=&u=&l=&f=&rbg=0'

def getResponse(url, timeout=None):
    """Fetch *url* with a browser-like User-Agent and return the raw body.

    Args:
        url: Absolute URL to request.
        timeout: Optional socket timeout in seconds. ``None`` (the default)
            leaves urllib's default timeout behaviour untouched, so existing
            callers see no change.

    Returns:
        The response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError``, a
            subclass, is raised for HTTP error status codes).
    """
    req = ur.Request(
        url=url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        },
    )
    # Only forward the timeout when the caller asked for one, so the default
    # call path is exactly the same urlopen(req) call as before.
    options = {} if timeout is None else {'timeout': timeout}
    # The context manager closes the underlying connection even when read()
    # raises; without it the response object was never closed.
    with ur.urlopen(req, **options) as response:
        return response.read()


if __name__ == '__main__':
    # Script entry point: ask for a keyword and a page range, walk the search
    # result pages, and save every linked blog page as blog/<title>.html.

    # Imported locally because only this script section needs them.
    import os
    from urllib.parse import quote

    keyword = input('关键字:')
    pn_start = int(input('起始页:'))
    pn_end = int(input('终止页:'))

    # Percent-encode the keyword before placing it in the query string.
    # Raw spaces, '&', '#' or non-ASCII characters would otherwise produce an
    # invalid URL or make the request fail while encoding it.
    encoded_keyword = quote(keyword, safe='')

    # open() does not create missing directories, so make sure the output
    # directory exists before the first file is written.
    os.makedirs('blog', exist_ok=True)

    # Walk every result page from the start page to the end page (inclusive).
    for page in range(pn_start, pn_end + 1):
        # First level: the search result page, built from the module-level
        # `url` template instead of a second copy of the same literal.
        response = getResponse(
            url=url.format(page=page, keyword=encoded_keyword)
        )

        # Second level: collect the blog links found on the result page.
        hrefs = le.HTML(response).xpath('//div[@class="search-list-con"]/dl//span[@class="mr16"]/../../dt/div/a[1]/@href')
        for href in hrefs:
            response_blog = getResponse(url=href)
            titles = le.HTML(response_blog).xpath('//h1[@class="title-article"]/text()')
            if not titles:
                # No matching <h1> on this page: skip it rather than let an
                # IndexError abort the remaining downloads.
                continue
            # Drop characters that are not allowed in file names.
            title = re.sub(r'[/\\:*"<>|?]', '', titles[0])
            filepath = 'blog/%s.html' % title
            with open(filepath, 'wb') as f:
                f.write(response_blog)
            print(title)