Python 爬虫并发下载 PDF 文件

爬虫需求

下载指定网站上的所有pdf文档，如下图所示
代码实现

# main.py
import random
import time
from urllib import response
import urllib.request
import os
import sys
import re
import time
# 需运行 pip install Tenacity
from tenacity import retry, stop_after_attempt
from concurrent.futures import ThreadPoolExecutor, as_completed
# Paged search-results URL with all category filters pre-selected; the page
# number is appended directly to the end of this string.
BASE_URL = 'https://www.pfeiffer-vacuum.com/zh/%E4%B8%8B%E8%BD%BD%E4%B8%AD%E5%BF%83/container.action?searchQuery=&categoryChoice=15&categoryChoice=%E6%93%8D%E4%BD%9C%E6%89%8B%E5%86%8C&categoryChoice=25&categoryChoice=19&categoryChoice=23&categoryChoice=%E7%BB%B4%E6%8A%A4%E8%AF%B4%E6%98%8E&categoryChoice=%E5%AE%A3%E4%BC%A0%E5%86%8C&categoryChoice=60&categoryChoice=56&categoryChoice=50&categoryChoice=9&categoryChoice=4&categoryChoice=12&categoryChoice=1&categoryChoice=27&categoryChoice=3&categoryChoice=7&categoryChoice=2&categoryChoice=8&categoryChoice=62&categoryChoice=57&categoryChoice=5&categoryChoice=11&categoryChoice=13&categoryChoice=10&categoryChoice=14&categoryChoice=54&categoryChoice=55&categoryChoice=34&categoryChoice=36&categoryChoice=33&categoryChoice=35&categoryChoice=41&categoryChoice=42&categoryChoice=48&categoryChoice=39&categoryChoice=40&categoryChoice=43&categoryChoice=45&categoryChoice=61&categoryChoice=49&categoryChoice=37&search=true&page='
# Captures the path of each PDF download link on a result page. Written as a
# raw string with an escaped dot so that only a literal ".pdf" suffix matches
# (an unescaped "." would accept any character there, e.g. "notapdf").
BOOK_LINK_PATTERN = r'href=".*=(.*\.pdf)" class="download" target="_blank"'
# Number of result pages to crawl (pages 1..MAXPAGE).
MAXPAGE = 177


# Collect every PDF download link found on result page `page`.
def getDownLoadLink(page):
    """Fetch one result page and extract its PDF download links.

    Args:
        page: Page number; it is appended to ``BASE_URL`` to form the
            request URL.

    Returns:
        A tuple ``(links, page)``. ``links`` is a list of
        ``(url, fileName)`` pairs, one per distinct match of
        ``BOOK_LINK_PATTERN`` in the page; ``page`` is echoed back so the
        caller can tell which page a completed task belongs to.
    """
    req = getReq(BASE_URL + str(page))
    # The context manager closes the HTTP response as soon as the body has
    # been read instead of leaving it open until garbage collection.
    with urllib.request.urlopen(req) as html:
        doc = html.read().decode('utf8')
    ret = []
    # set() removes links that appear more than once on the page.
    for v in set(re.findall(BOOK_LINK_PATTERN, doc)):
        url = "https://www.pfeiffer-vacuum.com" + \
            v+"?request_locale=zh_CN&referer=2063"
        # File name = everything after the last '/' (the whole string when
        # there is no '/').
        fileName = v.rsplit('/', 1)[-1]
        ret.append((url, fileName))
    return ret, page


# Pool of browser User-Agent strings; getReq() picks one at random for each
# request it builds.
agnetsList = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"
]


# Build a request that imitates a regular browser client.
def getReq(url):
    """Return a ``urllib.request.Request`` for *url* with browser-like headers.

    The ``User-Agent`` header is chosen at random from ``agnetsList`` on
    every call; ``Accept`` and ``Referer`` are fixed.
    """
    userAgent = random.choice(agnetsList)
    return urllib.request.Request(
        url,
        headers={
            'Accept': '*/*',
            'Referer': 'https://www.pfeiffer-vacuum.com',
            'User-Agent': userAgent,
        },
    )


# Download the PDF behind a single link.
@retry(stop=stop_after_attempt(6))  # a failed download is retried automatically, up to six attempts
def downloadPdf(url, fileName, savePath, page):
    """Download *url* and store it as ``savePath/fileName``.

    Args:
        url: Address of the PDF to fetch.
        fileName: Name of the file to create inside *savePath*.
        savePath: Existing directory that receives the file.
        page: Page number the link came from; used only in the log line.

    Any exception raised here triggers another attempt through the
    ``retry`` decorator until the attempt limit is reached.
    """
    req = getReq(url)
    # Read the whole body first so nothing is written to disk when the
    # transfer itself fails.
    with urllib.request.urlopen(req) as conn:
        buf = conn.read()
    # The context manager closes the file even when the write raises.
    with open(savePath+"/"+fileName, 'wb') as file:
        file.write(buf)
    print(
        f'The page {page} file:{fileName} download is complete')


# Download all links of one page concurrently.
def downloadParallel(links, page):
    """Download every ``(url, fileName)`` pair in *links* on a thread pool.

    Files are stored under ``./data/page_<page>``; the directory (and its
    parents) is created when missing. Progress banners with the elapsed
    time are printed before and after the downloads.

    Args:
        links: Iterable of ``(url, fileName)`` pairs for this page.
        page: Page number, used for the directory name and log output.

    Raises:
        Whatever exception a download task ends with, re-raised by
        ``future.result()``.
    """
    print(
        f'********* Page {page} starts the download *********')
    savePath = './data/page_'+str(page)
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: there is no
    # window between check and creation, and a missing ./data parent no
    # longer makes the call fail.
    os.makedirs(savePath, exist_ok=True)
    startTime = time.time()
    with ThreadPoolExecutor() as pool:
        # submit tasks
        futures = [pool.submit(downloadPdf, url, fileName, savePath, page)
                   for url, fileName in links]
        for future in as_completed(futures):
            # result() is called only to surface exceptions from the worker.
            future.result()
    print(
        f'********* page {page} download completed! The time is {time.time()-startTime:.2f} s *********')


# Print a transient status line.
def msg(txt):
    """Print *txt*, then emit the ANSI sequences for "cursor up one line"
    and "erase line" to standard output.
    """
    print(txt)
    # '\x1b[1A' moves the cursor up one line; '\x1b[2K' erases that line.
    for ansiCode in ('\x1b[1A', '\x1b[2K'):
        sys.stdout.write(ansiCode)


# Fetch the PDF download links of all pages concurrently.
def getLinks():
    """Collect the download links of pages 1..MAXPAGE on a thread pool.

    Returns:
        A list of length ``MAXPAGE + 1``. Index ``p`` holds the list of
        ``(url, fileName)`` pairs returned by ``getDownLoadLink(p)``;
        index 0 is never assigned and keeps its initial value ``0``.
    """
    links = [0]*(MAXPAGE+1)
    cnt = 0
    with ThreadPoolExecutor() as pool:
        futures = [pool.submit(getDownLoadLink, page)
                   for page in range(1, MAXPAGE+1)]
        for future in as_completed(futures):
            # Tasks finish in arbitrary order; the page number returned with
            # the result says which slot to fill.
            res, pageNo = future.result()
            links[pageNo] = res
            cnt += 1
            # Report progress against the real page count instead of a
            # hard-coded total.
            msg(f'get links {cnt}/{MAXPAGE}...')
    return links


# Download the links of all pages concurrently.
def download(links):
    """Run ``downloadParallel`` for pages 1..MAXPAGE on a thread pool.

    Args:
        links: Sequence indexed by page number; ``links[p]`` is passed to
            ``downloadParallel`` together with ``p``.
    """
    with ThreadPoolExecutor() as executor:
        # Queue one task per page.
        pending = []
        for pageNo in range(1, MAXPAGE + 1):
            pending.append(
                executor.submit(downloadParallel, links[pageNo], pageNo))
        # Consume the tasks in completion order; result() re-raises any
        # exception a page task ended with.
        for finished in as_completed(pending):
            finished.result()


if __name__ == "__main__":
    # Script entry point: make sure the output directory exists, collect all
    # links, download them, then report the total elapsed time.
    # exist_ok=True makes this safe to re-run without a separate exists() check.
    os.makedirs('data', exist_ok=True)
    startTime = time.time()
    links = getLinks()  # collect every PDF link to download
    download(links)  # start downloading
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
    print(
        f"\t\t\t all page completed! The total time is {time.time()-startTime:.2f} s\n")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
运行结果

运行命令

pip install tenacity
python3 main.py
结果图

获取下载链接
开始下载
爬虫需求

代码实现

运行结果

运行命令

结果图

获取下载链接

开始下载