#!/usr/bin/env python
# -*- coding:utf-8 -*-
import json
import os
import sys
import urllib.request
import random
import re
import gzip
def random_user_agent():
    # Pick a random recent Chrome version so the User-Agent header varies
    # between runs instead of presenting one fixed fingerprint.
    _USER_AGENT_TPL = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/%s Safari/537.36')
    _CHROME_VERSIONS = (
        '74.0.3729.129', '76.0.3780.3', '76.0.3780.2', '74.0.3729.128',
        '76.0.3780.1', '76.0.3780.0', '75.0.3770.15', '74.0.3729.127',
        '74.0.3729.126', '76.0.3779.1', '76.0.3779.0', '75.0.3770.14',
        '74.0.3729.125', '76.0.3778.1', '76.0.3778.0', '75.0.3770.13',
        '74.0.3729.124', '74.0.3729.123', '73.0.3683.121',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
random_headers = {
    'User-Agent': random_user_agent(),
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
header = random_headers
def get_info(url, header):
    # Fetch the page and decompress the body; Accept-Encoding above asks for
    # gzip, and the watch page is normally served gzip-compressed.
    request = urllib.request.Request(url, headers=header)
    html_bytes = gzip.decompress(urllib.request.urlopen(request).read())
    html_info = html_bytes.decode('utf-8')
    return html_info
def download_file(stream):
    # "stream" is one entry from adaptiveFormats. Its mimeType looks like
    # 'video/mp4; codecs="..."', so the media type ("video"/"audio") and the
    # file extension can both be taken from the part before the ";".
    type_name = stream['mimeType'].split(";")[0].split('/')[0]
    ext_name = stream['mimeType'].split(";")[0].split('/')[1]
    file_name = name_target[0]  # video title scraped from the page below
    base_name = file_name + type_name + "." + ext_name
    print(base_name + " start download")
    stream_url = stream['url']
    if sys.platform == "win32":
        join_str = "\\"
    else:
        join_str = "/"
    request = urllib.request.Request(stream_url, headers=header)
    file_bytes = urllib.request.urlopen(request)
    with open(base_name, "wb") as f:
        f.write(file_bytes.read())
    print(base_name + " finish download")
    current_name = os.getcwd() + join_str + base_name
    print(current_name)
    return current_name
test_url = "https://www.youtube.com/watch?v=N-bI8_Cc7E4"
html_info = get_info(test_url, header)
# The watch page embeds the player response as JSON; pull out the
# "streamingData" object and the video title with two regular expressions.
re_json = r'"streamingData":(.*?),"playbackTracking":'
re_name = r' - YouTube</title><meta name="title" content="(.*?)"><meta name="description"'
json_target = re.findall(re_json, html_info)
name_target = re.findall(re_name, html_info)
json_info = json.loads(json_target[0])
adapt_formats = json_info['adaptiveFormats']
# itag 137 is the 1080p MP4 video-only stream and itag 140 the AAC audio-only
# stream; download both.
for i in adapt_formats:
    if i["itag"] == 137 or i["itag"] == 140:
        download_file(i)
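# Note: the two files written by download_file are a silent video track and a
# separate audio track. If a single playable file is wanted they can be merged
# afterwards; a minimal sketch, assuming ffmpeg is available on PATH and using
# hypothetical "<title>video.mp4" / "<title>audio.mp4" names as produced by
# download_file:
#
#   import subprocess
#   subprocess.run(["ffmpeg", "-i", "<title>video.mp4", "-i", "<title>audio.mp4",
#                   "-c", "copy", "merged.mp4"], check=True)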