爬虫需求

下载指定网站上的所有pdf文档,如下图所示

image-20220719232436521

代码实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# main.py
import random
import time
from urllib import response
import urllib.request
import os
import sys
import re
import time
# 需运行 pip install Tenacity
from tenacity import retry, stop_after_attempt
from concurrent.futures import ThreadPoolExecutor, as_completed
# Listing-page URL with every category filter pre-selected; it ends in
# "page=" so the page number can simply be appended to it.
BASE_URL = 'https://www.pfeiffer-vacuum.com/zh/%E4%B8%8B%E8%BD%BD%E4%B8%AD%E5%BF%83/container.action?searchQuery=&categoryChoice=15&categoryChoice=%E6%93%8D%E4%BD%9C%E6%89%8B%E5%86%8C&categoryChoice=25&categoryChoice=19&categoryChoice=23&categoryChoice=%E7%BB%B4%E6%8A%A4%E8%AF%B4%E6%98%8E&categoryChoice=%E5%AE%A3%E4%BC%A0%E5%86%8C&categoryChoice=60&categoryChoice=56&categoryChoice=50&categoryChoice=9&categoryChoice=4&categoryChoice=12&categoryChoice=1&categoryChoice=27&categoryChoice=3&categoryChoice=7&categoryChoice=2&categoryChoice=8&categoryChoice=62&categoryChoice=57&categoryChoice=5&categoryChoice=11&categoryChoice=13&categoryChoice=10&categoryChoice=14&categoryChoice=54&categoryChoice=55&categoryChoice=34&categoryChoice=36&categoryChoice=33&categoryChoice=35&categoryChoice=41&categoryChoice=42&categoryChoice=48&categoryChoice=39&categoryChoice=40&categoryChoice=43&categoryChoice=45&categoryChoice=61&categoryChoice=49&categoryChoice=37&search=true&page='
# Regex applied to a listing page's HTML; the capture group is the part of the
# href after the last '=' and up to and including the "pdf" suffix.
# NOTE(review): the '.' before "pdf" is unescaped, so it matches any character,
# and the pattern is not a raw string - confirm before tightening it.
BOOK_LINK_PATTERN = 'href=".*=(.*.pdf)" class="download" target="_blank"'
# Number of the last listing page; pages 1..MAXPAGE (inclusive) are crawled.
MAXPAGE = 177


# Collect every PDF download link found on listing page `page`.
def getDownLoadLink(page):
    """Fetch one listing page and extract its PDF download links.

    Args:
        page: Page number; it is appended to ``BASE_URL`` to form the URL.

    Returns:
        A ``(links, page)`` tuple. ``links`` is a list of ``(url, fileName)``
        pairs, one per distinct match of ``BOOK_LINK_PATTERN`` in the page.
        ``page`` is returned unchanged so that callers collecting results
        out of order can tell which page a result belongs to.
    """
    req = getReq(BASE_URL + str(page))
    # Use a context manager so the HTTP response is always closed.
    with urllib.request.urlopen(req) as html:
        doc = html.read().decode('utf8')

    ret = []
    # set() drops links that are matched more than once on the same page.
    for v in set(re.findall(BOOK_LINK_PATTERN, doc)):
        url = ("https://www.pfeiffer-vacuum.com" + v
               + "?request_locale=zh_CN&referer=2063")
        # The file name is everything after the last '/' of the matched path
        # (or the whole match when it contains no '/').
        fileName = v.rsplit('/', 1)[-1]
        ret.append((url, fileName))
    return ret, page


# Pool of browser User-Agent strings; getReq picks one at random per request.
agnetsList = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"
]


# Build a request that carries browser-like headers.
def getReq(url):
    """Return a ``urllib.request.Request`` for ``url`` with browser-like headers.

    The request carries a fixed ``Accept`` and ``Referer`` header plus a
    ``User-Agent`` chosen at random from ``agnetsList``.

    Args:
        url: Absolute URL to request.

    Returns:
        An unsent ``urllib.request.Request`` object.
    """
    headers = {
        'Accept': '*/*',
        'Referer': 'https://www.pfeiffer-vacuum.com',
        'User-Agent': random.choice(agnetsList)
    }
    return urllib.request.Request(url, headers=headers)


# Download the PDF behind a single link.
@retry(stop=stop_after_attempt(6))  # on failure, retry automatically; give up after six attempts
def downloadPdf(url, fileName, savePath, page):
    """Download ``url`` and store it as ``savePath/fileName``.

    Args:
        url: Absolute URL of the PDF.
        fileName: Name of the file to create inside ``savePath``.
        savePath: Existing directory that receives the file.
        page: Listing-page number; only used in the progress message.

    Any exception raised inside the function triggers another attempt via
    the ``retry`` decorator until its stop condition is reached.
    """
    req = getReq(url)
    with urllib.request.urlopen(req) as conn:
        buf = conn.read()
    # Context manager guarantees the file handle is closed even if the
    # write fails part-way (the original leaked it in that case).
    with open(savePath + "/" + fileName, 'wb') as file:
        file.write(buf)
    print(
        f'The page {page} file:{fileName} download is complete')


# Download all links of one listing page concurrently.
def downloadParallel(links, page):
    """Download every ``(url, fileName)`` pair in ``links`` using a thread pool.

    Files are written to ``./data/page_<page>``; the directory (and any
    missing parent) is created on demand.

    Args:
        links: Iterable of ``(url, fileName)`` pairs.
        page: Listing-page number the links belong to.

    Raises:
        Whatever ``downloadPdf`` raises for a link; the first such exception
        seen while collecting results is propagated.
    """
    print(
        f'********* Page {page} starts the download *********')
    savePath = './data/page_'+str(page)
    # exist_ok avoids the check-then-create race of exists() + mkdir() and
    # also works when the parent ./data directory is still missing.
    os.makedirs(savePath, exist_ok=True)
    startTime = time.time()
    with ThreadPoolExecutor() as pool:
        # Submit one download task per link.
        futures = [pool.submit(downloadPdf, url, fileName, savePath, page)
                   for url, fileName in links]
        for future in as_completed(futures):
            # result() re-raises any exception from the worker thread.
            future.result()
    print(
        f'********* page {page} download completed! The time is {time.time()-startTime:.2f} s *********')


# Print a status line that the next output will overwrite.
def msg(txt):
    """Print ``txt`` followed by ANSI "cursor up" and "erase line" sequences.

    The text and a newline are printed first; the two escape sequences are
    then written to ``sys.stdout`` without a trailing newline.

    Args:
        txt: Message to display.
    """
    CURSOR_UP_ONE = '\x1b[1A'
    ERASE_LINE = '\x1b[2K'
    print(txt)
    sys.stdout.write(CURSOR_UP_ONE)
    sys.stdout.write(ERASE_LINE)


# Collect the PDF download links of every listing page concurrently.
def getLinks():
    """Fetch the links of pages ``1..MAXPAGE`` with a thread pool.

    Returns:
        A list of length ``MAXPAGE + 1`` indexed by page number. Entry ``i``
        (for ``1 <= i <= MAXPAGE``) is the list of ``(url, fileName)`` pairs
        returned by ``getDownLoadLink(i)``; entry ``0`` is an unused
        placeholder left at ``0``.

    Raises:
        Whatever ``getDownLoadLink`` raises for a page.
    """
    links = [0]*(MAXPAGE+1)
    cnt = 0
    with ThreadPoolExecutor() as pool:
        futures = [pool.submit(getDownLoadLink, page)
                   for page in range(1, MAXPAGE+1)]
        # Results arrive in completion order, so use the echoed page number
        # to store each result in its own slot.
        for future in as_completed(futures):
            res, pageNo = future.result()
            links[pageNo] = res
            cnt += 1
            # Show the real total instead of the stale hard-coded 176.
            msg(f'get links {cnt}/{MAXPAGE}...')
    return links


# Download the links of all listing pages concurrently.
def download(links):
    """Run ``downloadParallel`` for pages ``1..MAXPAGE`` in a thread pool.

    Args:
        links: Sequence indexed by page number, as returned by ``getLinks``;
            ``links[page]`` holds that page's ``(url, fileName)`` pairs.

    Raises:
        Whatever ``downloadParallel`` raises for a page.
    """
    with ThreadPoolExecutor() as pool:
        # Submit one task per page.
        futures = [pool.submit(downloadParallel, links[page], page)
                   for page in range(1, MAXPAGE+1)]
        # Wait for the tasks as they finish; result() re-raises any
        # exception that occurred in the worker thread.
        for future in as_completed(futures):
            future.result()


if __name__ == "__main__":
    # Make sure the output root exists; exist_ok makes re-runs harmless.
    os.makedirs('data', exist_ok=True)
    startTime = time.time()
    links = getLinks()  # collect the download links of every page
    download(links)  # download everything
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
    print(
        f"\t\t\t all page completed! The total time is {time.time()-startTime:.2f} s\n")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")

运行结果

运行命令

```bash
pip install tenacity
python3 main.py
```

结果图

  • 获取下载链接

image-20220719233122293

  • 开始下载

image-20220719233649164