Contents

Youtube手动下载视频和音频

Contents

其实所有的爬虫或者下载工具原理都是一样的:伪装成浏览器去下载。多线程下载则是先获取文件大小,再用 Range 请求头分段请求(服务器返回 206 状态码),各线程分别下载后再合并。废话不多说,上代码:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import json
import os
import sys
import urllib.request
import random
import re
import gzip


def random_user_agent():
    """Return a desktop Chrome User-Agent string with a randomly chosen version.

    One entry of the version list below is picked with ``random.choice`` and
    substituted into a fixed Windows 10 / Chrome User-Agent template.
    """
    chrome_builds = (
        '74.0.3729.129',
        '76.0.3780.3',
        '76.0.3780.2',
        '74.0.3729.128',
        '76.0.3780.1',
        '76.0.3780.0',
        '75.0.3770.15',
        '74.0.3729.127',
        '74.0.3729.126',
        '76.0.3779.1',
        '76.0.3779.0',
        '75.0.3770.14',
        '74.0.3729.125',
        '76.0.3778.1',
        '76.0.3778.0',
        '75.0.3770.13',
        '74.0.3729.124',
        '74.0.3729.123',
        '73.0.3683.121',
    )
    picked_build = random.choice(chrome_builds)
    return (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/%s Safari/537.36'
    ) % picked_build


# Browser-like request headers used for every HTTP request in this script.
# The User-Agent is chosen once, when this statement runs, and then reused.
random_headers = {
    "User-Agent": random_user_agent(),
    "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-us,en;q=0.5",
}

# Alias under which the functions below refer to the shared headers.
header = random_headers


def get_info(url, header):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        header: Mapping of HTTP request headers sent with the request.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the (decompressed) body is not valid UTF-8.
    """
    import zlib  # local import: only needed for "deflate" responses

    request = urllib.request.Request(url, headers=header)
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(request) as response:
        content_encoding = (response.headers.get('Content-Encoding') or '').strip().lower()
        html_bytes = response.read()

    # Decompress only when the body really is compressed; a server may answer
    # uncompressed even though the request advertised gzip/deflate.
    # b'\x1f\x8b' is the gzip magic number and can never start valid UTF-8.
    if content_encoding in ('gzip', 'x-gzip') or html_bytes[:2] == b'\x1f\x8b':
        html_bytes = gzip.decompress(html_bytes)
    elif content_encoding == 'deflate':
        try:
            html_bytes = zlib.decompress(html_bytes)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib wrapper.
            html_bytes = zlib.decompress(html_bytes, -zlib.MAX_WBITS)

    html_info = html_bytes.decode('utf-8')
    return html_info


def download_file(web_url):
    """Download one stream described by *web_url* into the current directory.

    The output file is named ``<title><type>.<ext>``, where the title comes
    from the module-level ``name_target`` list and type/extension come from
    the stream's MIME type (e.g. ``video/mp4`` -> ``<title>video.mp4``).

    Args:
        web_url: Mapping with at least the keys ``'mimeType'`` and ``'url'``.

    Returns:
        The absolute path of the written file.

    Raises:
        urllib.error.URLError: If the download request fails.
        OSError: If the output file cannot be written.
    """
    import shutil  # local import: used only for the streamed copy below

    mime_parts = web_url['mimeType'].split(";")[0].split('/')
    type_name = mime_parts[0]
    ext_name = mime_parts[1]

    # A title containing a path separator would make open() fail (or write
    # into another directory), so replace separators before building the name.
    file_name = name_target[0]
    for separator in (os.sep, os.altsep):
        if separator:
            file_name = file_name.replace(separator, "_")

    base_name = file_name + type_name + "." + ext_name
    print(base_name + " start download")

    # NOTE(review): the shared headers advertise gzip/deflate, but the body is
    # written exactly as received; media streams are presumably sent
    # unencoded -- confirm.
    request = urllib.request.Request(web_url['url'], headers=header)
    # Stream in chunks instead of holding the whole media file in memory, and
    # close both the connection and the file even if the copy fails.
    with urllib.request.urlopen(request) as response, open(base_name, "wb") as f:
        shutil.copyfileobj(response, f)
    print(base_name + " finish download")

    current_name = os.path.join(os.getcwd(), base_name)
    print(current_name)
    return current_name

# Watch-page URL of the video to download.
test_url = "https://www.youtube.com/watch?v=N-bI8_Cc7E4"

html_info = get_info(test_url, header)

# The stream list is embedded in the page as JSON between these two keys;
# the title is taken from the <meta name="title"> tag.
re_json = r'"streamingData":(.*?),"playbackTracking":'
re_name = r' - YouTube</title><meta name="title" content="(.*?)"><meta name="description"'
json_target = re.findall(re_json, html_info)
name_target = re.findall(re_name, html_info)

# Both patterns depend on the exact page layout; stop with a clear message
# instead of an IndexError further down when either one no longer matches.
if not json_target:
    sys.exit("streamingData not found in page: " + test_url)
if not name_target:
    sys.exit("video title not found in page: " + test_url)

json_info = json.loads(json_target[0])

adapt_formats = json_info['adaptiveFormats']

# itag 137 is the 1080p mp4 video stream, itag 140 the mp4 audio stream.
wanted_itags = (137, 140)
for i in adapt_formats:
    if i["itag"] in wanted_itags:
        download_file(i)

上面的代码分别下载 itag 为 137 和 140 的文件:137 为 1080p 的 mp4 视频,140 为 mp4 音频。因为 YouTube 上 1080p 及以上的视频都是音频、视频分离的,所以下载以后需要合并,合并方法:

1
ffmpeg.exe -i src.audio.mp4 -i src.video.mp4 -vcodec copy -acodec copy dst.mp4

因为选择的是 copy(直接复制音视频流,不重新编码),所以速度很快,秒级即可合并完成。