加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
WX_Article.py 6.17 KB
一键复制 编辑 原始数据 按行查看 历史
糖炒栗子 提交于 2020-03-03 00:17 . Initial commit
# -*- coding: utf-8 -*-
import re
from time import sleep
from bs4 import BeautifulSoup
import requests
from requests.cookies import RequestsCookieJar
import hashlib
import pymysql
import oss2
import config
from math import ceil
class GetAriticle(object):
    """Scrape the articles of one WeChat official account.

    The account is looked up by name through the mp.weixin.qq.com backend
    (which needs a logged-in ``token`` and cookies).  Every article found is
    appended to a local log file, its paragraphs are stored in the MySQL
    table ``wechat`` and its images are uploaded to an Aliyun OSS bucket.
    """

    # Number of articles requested per list page.
    page_size = 5
    # Text file that receives one title/link/cover record per article.
    article_log_path = '/spider.txt'

    def __init__(self, token, cookie, query_name, time_gap=30):
        """Open the OSS bucket, the MySQL connection and the HTTP session.

        :param token: backend token sent with every mp.weixin.qq.com request.
        :param cookie: iterable of dicts with ``"name"`` and ``"value"`` keys;
            each entry is copied into the session's cookie jar.
        :param query_name: account name to search for; also stored in the
            ``from_type`` column of every saved article.
        :param time_gap: seconds to sleep after each list page (default 30).
        """
        self.auth = oss2.Auth(config.oss_appid, config.oss_secretkey)
        self.bucket = oss2.Bucket(self.auth, config.oss_url, config.oss_bucket)
        self.conn = pymysql.connect(host=config.mysql_host,
                                    user=config.mysql_username,
                                    password=config.mysql_password,
                                    database=config.mysql_database)
        self.time_gap = time_gap
        self.token = token
        self.query_name = query_name
        self.headers = {
            'Host': 'mp.weixin.qq.com',
            'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
        }
        self.sess = requests.Session()
        self.c = RequestsCookieJar()
        for entry in cookie:
            self.c.set(entry["name"], entry["value"])
        self.sess.cookies.update(self.c)

    def Get_WeChat_Subscription(self):
        """Search for ``self.query_name`` and return the first hit's fakeid.

        :return: the ``fakeid`` of the first account in the search result.
        :raises KeyError: the response has no ``list`` / ``fakeid`` entry.
        :raises IndexError: the search returned no account.
        """
        # Passing the values through ``params`` lets requests URL-encode
        # them, so names containing '&', '=' or spaces cannot corrupt the
        # query string.
        params = {
            'action': 'search_biz',
            'token': self.token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': 1,
            'random': '0.5182749224035845',
            'query': self.query_name,
            'begin': 0,
            'count': 5,
        }
        html_json = self.sess.get('https://mp.weixin.qq.com/cgi-bin/searchbiz',
                                  headers=self.headers, params=params).json()
        print(html_json)
        try:
            account = html_json['list'][0]
            fakeid = account['fakeid']
            nickname = account['nickname']
        except (KeyError, IndexError):
            # Show the backend's own error message before propagating the
            # same exception type callers already had to expect.
            print("!! 失败信息:", html_json.get('base_resp', {}).get('err_msg'))
            raise
        print("fakeid:", fakeid)
        print("nickname: ", nickname)
        return fakeid

    def Get_Articles(self, fake_id):
        """Walk every list page of an account and download its articles.

        For each page the new articles are appended to
        ``article_log_path`` and handed to :meth:`get_content`; afterwards
        the crawler sleeps ``self.time_gap`` seconds.  Titles already seen
        in this run are skipped.

        :param fake_id: account id as returned by
            :meth:`Get_WeChat_Subscription`.
        :return: ``None``.  Returns early when the article count cannot be
            read from the first response.
        """
        first_page = self._fetch_article_page(fake_id, 0)
        try:
            total_pages = ceil(int(first_page['app_msg_cnt']) / self.page_size)
        except (KeyError, TypeError, ValueError) as e:
            print(e)
            print("!! 失败信息:", first_page.get('base_resp', {}).get('err_msg'))
            return

        seen_titles = set()
        for page_index in range(total_pages):
            print("第[%d/%d]页" % (page_index + 1, total_pages))
            if page_index == 0:
                # The first response already is page 0; do not fetch it twice.
                page_json = first_page
            else:
                page_json = self._fetch_article_page(
                    fake_id, page_index * self.page_size)

            if 'app_msg_list' not in page_json:
                print("!! 失败信息:", page_json.get('base_resp', {}).get('err_msg'))
                break
            app_msg_list = page_json['app_msg_list']
            if not app_msg_list:
                break

            # Titles and links are collected per page so that the two lists
            # handed to get_content always line up.
            page_titles, page_links = self._record_page(app_msg_list, seen_titles)
            print(">> 一页抓取结束,开始下载")
            self.get_content(page_titles, page_links)
            print(">> 休息 %d s" % self.time_gap)
            sleep(self.time_gap)
        print(">> 抓取结束")

    def get_content(self, title_buf, link_buf):
        """Download, store and upload the articles behind ``link_buf``.

        :param title_buf: article titles.
        :param link_buf: article URLs, pairwise matching ``title_buf``.
        :return: ``None``.
        """
        for raw_title, link in zip(title_buf, link_buf):
            # Replace characters that are not allowed in file names.
            each_title = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", raw_title)
            html = self.sess.get(link, headers=self.headers)
            soup = BeautifulSoup(html.text, 'lxml')
            body = soup.find(class_="rich_media_content")
            print("*" * 60)
            print(each_title)
            if body is None:
                # Page without an article body (e.g. removed article): there
                # is nothing to store, move on to the next link.
                print(">> 未找到文章正文,跳过")
                continue

            print(">> 保存文档 - ", end="")
            content_result = ''.join(
                paragraph.get_text() + '\n' for paragraph in body.find_all("p"))
            print("完毕!")

            # Only images that carry a data-src attribute can be downloaded.
            image_urls = [img["data-src"] for img in body.find_all("img")
                          if img.get("data-src")]
            print(">> 保存图片 - %d张" % len(image_urls), end="")
            pic_item = self._upload_images(image_urls)
            self._save_article(each_title, content_result, pic_item)
            print("完毕!\r\n")

    def _fetch_article_page(self, fake_id, begin):
        """Return the decoded JSON of the article list starting at ``begin``."""
        params = {
            'token': self.token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': 1,
            'random': '0.977467295649225',
            'action': 'list_ex',
            'begin': begin,
            'count': self.page_size,
            'query': '',
            'fakeid': fake_id,
            'type': 9,
        }
        return self.sess.get('https://mp.weixin.qq.com/cgi-bin/appmsg',
                             headers=self.headers, params=params).json()

    def _record_page(self, app_msg_list, seen_titles):
        """Log the unseen articles of one page; return their (titles, links)."""
        page_titles = []
        page_links = []
        # Explicit UTF-8: titles are Chinese text and the platform default
        # encoding may not be able to represent them.
        with open(self.article_log_path, 'a+', encoding='utf-8') as fp:
            for position, item in enumerate(app_msg_list, 1):
                try:
                    title = item['title']
                    link = item['link']
                    cover = item['cover']
                except KeyError as e:
                    # Incomplete entry: report the missing key and skip it.
                    print(e)
                    continue
                if title in seen_titles:
                    print("本条已存在,跳过")
                    continue
                seen_titles.add(title)
                page_titles.append(title)
                page_links.append(link)
                fp.write('*' * 60 + '\nTitle: ' + title + '\nLink: ' + link
                         + '\nImg: ' + cover + '\r\n')
                print(">> 第%d条写入完成:%s" % (position, title))
        return page_titles, page_links

    def _upload_images(self, image_urls):
        """Upload every image to OSS; return the list of their md5 names."""
        pic_item = []
        for image_url in image_urls:
            pic_down = requests.get(image_url)
            # The digest is taken over the textual repr of the bytes; this is
            # kept unchanged so object names stay the same as in earlier runs.
            pic_md5 = hashlib.md5(str(pic_down.content).encode('utf-8')).hexdigest()
            pic_item.append(pic_md5)
            self.bucket.put_object('{}/{}.jpeg'.format(config.oss_save_dir, pic_md5), pic_down)
            print('\n' + '*' * 60 + '\n{}/{}.jpeg'.format(config.oss_save_path, pic_md5))
        return pic_item

    def _save_article(self, title, content, pic_item):
        """Insert one article row into the ``wechat`` table and commit."""
        # Parameterized statement: the driver escapes the scraped text, so
        # quotes in a title or paragraph can neither break nor inject into
        # the SQL, and the content no longer needs its quotes stripped.
        sql = ("INSERT INTO wechat(title, content, from_type, img, status) "
               "VALUES(%s, %s, %s, %s, %s)")
        args = (title, content, self.query_name, ', '.join(pic_item), '1')
        with self.conn.cursor() as cursor:
            print(cursor.mogrify(sql, args))
            cursor.execute(sql, args)
        self.conn.commit()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化