diff --git "a/20210340634\347\216\213\346\227\255/\347\254\254\344\270\200\346\254\241\344\275\234\344\270\232/\346\225\260\346\215\256\347\210\254\345\217\226.py" "b/20210340634\347\216\213\346\227\255/\347\254\254\344\270\200\346\254\241\344\275\234\344\270\232/\346\225\260\346\215\256\347\210\254\345\217\226.py" new file mode 100644 index 0000000000000000000000000000000000000000..ccd867900bf5f1502b26f10c7297566cd22a0f8f --- /dev/null +++ "b/20210340634\347\216\213\346\227\255/\347\254\254\344\270\200\346\254\241\344\275\234\344\270\232/\346\225\260\346\215\256\347\210\254\345\217\226.py" @@ -0,0 +1,19 @@ +import requests +from bs4 import BeautifulSoup + +urls = [f'https://www.cnblogs.com/#{page}' for page in range(1, 51)] + +# 生产者 +def craw(url): + r = requests.get(url) + return r.text + +# 消费者 +def parse(html): + soup = BeautifulSoup(html, "html.parser") + links = soup.find_all("a", class_="post-item-title") + return [(link["href"], link.get_text()) for link in links] + +if __name__ == "__main__": + for result in parse(craw(urls[2])): + print(result) diff --git "a/20210340634\347\216\213\346\227\255/\347\254\254\344\270\200\346\254\241\344\275\234\344\270\232/\346\225\260\346\215\256\347\210\254\345\217\2262.py" "b/20210340634\347\216\213\346\227\255/\347\254\254\344\270\200\346\254\241\344\275\234\344\270\232/\346\225\260\346\215\256\347\210\254\345\217\2262.py" new file mode 100644 index 0000000000000000000000000000000000000000..ea036fc05c7d4c945d89a0db1b0bbc1850130482 --- /dev/null +++ "b/20210340634\347\216\213\346\227\255/\347\254\254\344\270\200\346\254\241\344\275\234\344\270\232/\346\225\260\346\215\256\347\210\254\345\217\2262.py" @@ -0,0 +1,38 @@ +import queue +import 数据爬取 +import time +import random +import threading +def do_craw(url_queue: queue.Queue, html_queue: queue.Queue): + while True: + url = url_queue.get() + html = 数据爬取.craw(url) + html_queue.put(html) + # 打印相关日志 + print(threading.current_thread().name, f"craw {url}", + "url_queue.size=", url_queue.qsize()) + time.sleep(random.randint(1, 2)) +def do_parse(html_queue: queue.Queue, fout): + while True: + html = html_queue.get() + results = 数据爬取.parse(html) + for result in results: + fout.write(str(result) + "\n") + # 打印当前线程名字 + print(threading.current_thread().name, f"results.size", len(results), + "html_queue.size=", html_queue.qsize()) + time.sleep(random.randint(1, 2)) +if __name__ == "__main__": + url_queue = queue.Queue() + html_queue = queue.Queue() + for url in 数据爬取.urls: + url_queue.put(url) + + for idx in range(3): + t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw{idx}") + t.start() + + fout = open("02.data.txt", 'w') + for idx in range(2): + t = threading.Thread(target=do_parse, args=(html_queue, fout), name=f"parse{idx}") + t.start()