master

分支 (1)

管理

管理

master

request-craw
/
BlibCrawlers.py

import json
import os
import re
import subprocess
from pprint import pprint

import requests


# Request headers shared by every call.  Per the original author's notes, the
# media URLs answer 403 unless the bilibili referer (anti-hotlinking) is sent.
_BILI_HEADERS = {
    'referer': 'https://www.bilibili.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/105.0.0.0 '
                  'Safari/537.36 Edg/105.0.1343.27 ',
}

# Seconds to wait on connect/read before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 30

# Size of the pieces streamed to disk while downloading media.
_DOWNLOAD_CHUNK_SIZE = 1024 * 1024

# Patterns used to pull the title and the embedded play-info JSON out of the page.
_VIDEO_TITLE_PATTERN = '<title data-vue-meta="true">(.*?)</title>'
_BANGUMI_TITLE_PATTERN = '<h1 title=".*?">(.*?)</h1>'
_PLAY_INFO_PATTERN = '<script>window.__playinfo__=(.*?)</script>'

# Whitespace, characters Windows forbids in file names, and control characters.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\s\\/:*?"<>|\x00-\x1f]')


def _safe_title(raw_title):
    """Return *raw_title* with every file-name-unsafe character replaced by ``_``."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub('_', raw_title)
    return cleaned or 'untitled'


def _extract_first(pattern, page_html, what):
    """Return group 1 of the first match of *pattern* in *page_html*.

    Raises IndexError when nothing matches.  IndexError is kept on purpose:
    it is what the former ``re.findall(...)[0]`` raised, so existing callers
    that catch it keep working, but the message now says what was missing.
    """
    match = re.search(pattern, page_html)
    if match is None:
        raise IndexError(f'{what} not found in the page')
    return match.group(1)


def _fetch_page(url):
    """Download the HTML of *url* and return it as text."""
    if not isinstance(url, str) or not url.strip():
        raise ValueError(f'url must be a non-empty string, got {url!r}')
    response = requests.get(url=url, headers=_BILI_HEADERS, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def _load_play_info(page_html):
    """Parse the ``window.__playinfo__`` JSON object embedded in the page."""
    return json.loads(_extract_first(_PLAY_INFO_PATTERN, page_html, 'window.__playinfo__ script'))


def _prepare_dir(base_dir, name):
    """Return the absolute path of sub-directory *name*, creating it if needed."""
    path = os.path.join(os.path.abspath(base_dir or os.curdir), name)
    os.makedirs(path, exist_ok=True)
    return path


def _download(media_url, destination):
    """Stream *media_url* into the file *destination*."""
    with requests.get(url=media_url, headers=_BILI_HEADERS, stream=True,
                      timeout=_REQUEST_TIMEOUT) as response:
        # Fail loudly rather than saving an HTTP error body as if it were media.
        response.raise_for_status()
        with open(destination, mode='wb') as sink:
            for chunk in response.iter_content(chunk_size=_DOWNLOAD_CHUNK_SIZE):
                sink.write(chunk)


def _merge_streams(video_path, audio_path, output_path):
    """Mux the separate video and audio files into *output_path* with ffmpeg."""
    # An argument list (no shell) keeps a scraped title from being interpreted
    # as shell syntax and copes with any character left in the paths.
    command = [
        'ffmpeg',
        '-i', video_path,
        '-i', audio_path,
        '-c:v', 'copy',
        '-c:a', 'aac',
        '-strict', 'experimental',
        output_path,
    ]
    subprocess.run(command, check=True)


def _save_dash_streams(dash, title, base_dir):
    """Download the first audio and video DASH streams and merge them."""
    # Resolve both URLs first so a malformed payload fails before anything is written.
    audio_url = dash['audio'][0]['baseUrl']
    video_url = dash['video'][0]['baseUrl']

    audio_path = os.path.join(_prepare_dir(base_dir, 'audio'), title + '.mp3')
    video_dir = _prepare_dir(base_dir, 'video')
    video_path = os.path.join(video_dir, title + '.mp4')
    merged_path = os.path.join(video_dir, title + 'output.mp4')

    _download(audio_url, audio_path)
    _download(video_url, video_path)
    # The merge reads exactly the files written above (previously it looked in
    # a hard-coded E:\ directory regardless of where the files had been saved).
    _merge_streams(video_path, audio_path, merged_path)


# Download an ordinary bilibili video (original author's note: watermarked).
def BlibCraw(url, base_dir=None):
    """Download an ordinary bilibili video page and merge its audio and video.

    The page embeds separate audio and video streams; both are saved and then
    combined with ffmpeg.  Files written under *base_dir*:
    ``audio/<title>.mp3``, ``video/<title>.mp4`` and
    ``video/<title>output.mp4`` (the merged result).

    Args:
        url: Address of the video page.
        base_dir: Directory that holds the ``audio`` and ``video`` folders
            (created when missing).  Defaults to the current working directory.

    Raises:
        ValueError: *url* is empty or not a string, or the play info is not valid JSON.
        IndexError: the title or the play-info script is not found in the page.
        KeyError: the play info lacks the expected ``data``/``dash`` entries.
        requests.RequestException: a request failed or returned an HTTP error.
        FileNotFoundError: the ``ffmpeg`` executable is not available.
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
    """
    page_html = _fetch_page(url)
    title = _safe_title(_extract_first(_VIDEO_TITLE_PATTERN, page_html, 'video title'))
    play_info = _load_play_info(page_html)
    _save_dash_streams(play_info['data']['dash'], title, base_dir)


# Download a bilibili bangumi episode (original author's note: watermarked; for
# members-only episodes only the three-minute preview can be fetched).
def BlibCrawForep(url, base_dir=None):
    """Download a bilibili bangumi (series) episode page.

    When the play info contains ``durl`` (the preview case, where audio and
    video are not separated) the single stream is saved as
    ``video/<title>.mp4``.  Otherwise the separate audio and video streams are
    saved and merged exactly as in :func:`BlibCraw`.

    Args:
        url: Address of the episode page.
        base_dir: Directory that holds the ``audio`` and ``video`` folders
            (created when missing).  Defaults to the current working directory.

    Raises:
        ValueError: *url* is empty or not a string, or the play info is not valid JSON.
        IndexError: the title or the play-info script is not found in the page.
        KeyError: the play info lacks the expected entries.
        requests.RequestException: a request failed or returned an HTTP error.
        FileNotFoundError: the ``ffmpeg`` executable is not available (merge case).
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
    """
    page_html = _fetch_page(url)
    title = _safe_title(_extract_first(_BANGUMI_TITLE_PATTERN, page_html, 'episode title'))
    data = _load_play_info(page_html)['data']

    if 'durl' in data:
        # Preview stream: already muxed, so there is nothing to merge.
        preview_url = data['durl'][0]['url']
        _download(preview_url, os.path.join(_prepare_dir(base_dir, 'video'), title + '.mp4'))
        return

    # Freely viewable episode: audio and video are delivered separately.
    _save_dash_streams(data['dash'], title, base_dir)