加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
BlibCrawlers.py 4.11 KB
一键复制 编辑 原始数据 按行查看 历史
GuLu 提交于 2022-09-18 17:19 . request爬虫
import json
import os
import re
import subprocess
from pprint import pprint

import requests
# 爬取b站普通视频 有水印
def BlibCraw(url):
    """Download a regular Bilibili video and merge its streams with ffmpeg.

    The page at ``url`` is fetched, then the title and the embedded
    ``window.__playinfo__`` JSON are pulled out with regular expressions.
    The first DASH audio and video streams are saved as
    ``audio/<title>.mp3`` and ``video/<title>.mp4`` (relative to the current
    working directory) and ffmpeg combines them into
    ``video/<title>output.mp4``.  Per the original author's note the result
    carries a watermark.

    Args:
        url: Address of the video page.

    Raises:
        IndexError: If the title or the play-info script is not in the page
            (same exception type the previous ``findall(...)[0]`` produced).
        KeyError: If the play-info JSON has no ``data.dash`` streams.
        requests.HTTPError: If the page or a media stream answers 4xx/5xx.
        FileNotFoundError: If the ``ffmpeg`` executable is not on PATH.
    """
    headers = {
        # Anti-hotlinking: the media URLs answer 403 without this referer.
        'referer': 'https://www.bilibili.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 '
                      'Safari/537.36 Edg/105.0.1343.27 '
    }
    # (connect, read) seconds; keeps a stalled connection from hanging forever.
    timeout = (10, 60)

    def save_stream(stream_url, target):
        """Stream ``stream_url`` into the file ``target`` chunk by chunk."""
        with requests.get(url=stream_url, headers=headers, stream=True, timeout=timeout) as media:
            # Fail loudly instead of saving an HTTP error body as a media file.
            media.raise_for_status()
            with open(target, mode='wb') as out:
                for chunk in media.iter_content(chunk_size=1 << 20):
                    out.write(chunk)

    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()

    title_match = re.search('<title data-vue-meta="true">(.*?)</title>', response.text)
    if title_match is None:
        raise IndexError('video title not found in page: ' + url)
    playinfo_match = re.search('<script>window.__playinfo__=(.*?)</script>', response.text)
    if playinfo_match is None:
        raise IndexError('window.__playinfo__ not found in page: ' + url)

    # Spaces were already mapped to '_'; characters that are illegal in file
    # names (or act as path separators) get the same treatment.
    title = re.sub(r'[ \\/:*?"<>|]', '_', title_match.group(1))

    # Bilibili serves the audio and the video as separate DASH streams.
    dash = json.loads(playinfo_match.group(1))['data']['dash']
    audio_url = dash['audio'][0]['baseUrl']
    video_url = dash['video'][0]['baseUrl']

    os.makedirs('audio', exist_ok=True)
    os.makedirs('video', exist_ok=True)
    audio_path = os.path.join('audio', title + '.mp3')
    video_path = os.path.join('video', title + '.mp4')
    output_path = os.path.join('video', title + 'output.mp4')

    save_stream(audio_url, audio_path)
    save_stream(video_url, video_path)

    # Merge audio and video.  ffmpeg receives exactly the paths written above
    # (no hard-coded drive), and the argument list bypasses the shell so the
    # scraped title can never be interpreted as shell syntax.
    subprocess.run([
        'ffmpeg', '-i', video_path, '-i', audio_path,
        '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental',
        output_path,
    ])
# 爬取b站番剧 需要会员的只能爬取试看三分钟 有水印
def BlibCrawForep(url):
    """Download a Bilibili bangumi (series) episode.

    The page at ``url`` is fetched, then the title and the embedded
    ``window.__playinfo__`` JSON are pulled out with regular expressions.

    * If the play info contains ``durl`` (the preview case, where audio and
      video are already combined), the first ``durl`` entry is saved as
      ``video/<title>.mp4`` and nothing else is done.
    * Otherwise the first DASH audio and video streams are saved as
      ``audio/<title>.mp3`` and ``video/<title>.mp4`` and ffmpeg combines
      them into ``video/<title>output.mp4``.

    All paths are relative to the current working directory.  Per the
    original author's note, members-only episodes only yield the
    three-minute preview, and the result carries a watermark.

    Args:
        url: Address of the episode page.

    Raises:
        IndexError: If the title or the play-info script is not in the page
            (same exception type the previous ``findall(...)[0]`` produced).
        KeyError: If the play-info JSON has neither ``durl`` nor ``dash``.
        requests.HTTPError: If the page or a media stream answers 4xx/5xx.
        FileNotFoundError: If the ``ffmpeg`` executable is not on PATH.
    """
    headers = {
        # Anti-hotlinking: the media URLs answer 403 without this referer.
        'referer': 'https://www.bilibili.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/105.0.0.0 '
                      'Safari/537.36 Edg/105.0.1343.27 '
    }
    # (connect, read) seconds; keeps a stalled connection from hanging forever.
    timeout = (10, 60)

    def save_stream(stream_url, target):
        """Stream ``stream_url`` into the file ``target`` chunk by chunk."""
        with requests.get(url=stream_url, headers=headers, stream=True, timeout=timeout) as media:
            # Fail loudly instead of saving an HTTP error body as a media file.
            media.raise_for_status()
            with open(target, mode='wb') as out:
                for chunk in media.iter_content(chunk_size=1 << 20):
                    out.write(chunk)

    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()

    title_match = re.search('<h1 title=".*?">(.*?)</h1>', response.text)
    if title_match is None:
        raise IndexError('episode title not found in page: ' + url)
    playinfo_match = re.search('<script>window.__playinfo__=(.*?)</script>', response.text)
    if playinfo_match is None:
        raise IndexError('window.__playinfo__ not found in page: ' + url)

    # Spaces were already mapped to '_'; characters that are illegal in file
    # names (or act as path separators) get the same treatment.
    title = re.sub(r'[ \\/:*?"<>|]', '_', title_match.group(1))
    data = json.loads(playinfo_match.group(1))['data']

    os.makedirs('video', exist_ok=True)
    video_path = os.path.join('video', title + '.mp4')

    # Preview episode: a single file that already contains audio and video.
    if 'durl' in data:
        save_stream(data['durl'][0]['url'], video_path)
        return

    # Freely watchable episode: audio and video are separate DASH streams.
    dash = data['dash']
    audio_url = dash['audio'][0]['baseUrl']
    video_url = dash['video'][0]['baseUrl']

    os.makedirs('audio', exist_ok=True)
    audio_path = os.path.join('audio', title + '.mp3')
    output_path = os.path.join('video', title + 'output.mp4')

    save_stream(audio_url, audio_path)
    save_stream(video_url, video_path)

    # Merge audio and video.  ffmpeg receives exactly the paths written above
    # (no hard-coded drive), and the argument list bypasses the shell so the
    # scraped title can never be interpreted as shell syntax.
    subprocess.run([
        'ffmpeg', '-i', video_path, '-i', audio_path,
        '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental',
        output_path,
    ])
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化