Im folgenden Artikel hatte ich Code geschrieben, der die Bilder aus einer URL-Liste der Reihe nach herunterlädt. Hier habe ich ihn so geändert, dass mehrere Bilder gleichzeitig heruntergeladen werden.
Bilder von der URL-Liste in Python herunterladen
Eine Datei wird zufällig aus der Liste ausgewählt. Mehrere Dateien werden gleichzeitig heruntergeladen. Gleichzeitige Downloads von derselben Domain werden vermieden. Dateizugriffe sind durch eine Sperre geschützt, damit nicht gleichzeitig in Dateien geschrieben wird.
async_downloader.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time, os, glob, random, requests, threading
# HTTP headers sent with every download request.
headers = {'User-Agent': 'Mozilla/5.0'}

# All paths are anchored at the directory the script is started from.
cwd = os.getcwd()
# Base directory under which downloaded files are stored.
result_dir = '{0}/download/'.format(cwd)
# Directory whose files each hold one URL per line.
list_dir = '{0}/list/'.format(cwd)

# Log files (relative to the working directory) that receive one URL per
# line for finished and failed downloads respectively.
done_file = 'done.txt'
fail_file = 'fail.txt'

# Seconds the main loop sleeps between checks of the running thread count.
wait_sec = 1
# The main loop waits while the active thread count exceeds this value.
max_download = 5
class FileHandler(object):
    """Serialise access to the URL list files and the done/fail logs.

    All file reads and writes happen while ``self.lock`` is held, so the
    download threads never modify the same file at the same time. The lock
    is always taken with ``with`` so that an I/O error cannot leave it
    held and block every other thread.
    """

    # Seconds to pause (outside the lock) before drawing again when the
    # drawn URL's domain is currently being downloaded by another thread.
    retry_wait = 0.1

    def __init__(self):
        """Collect the list files found in ``list_dir`` and set up the lock."""
        self.lock = threading.Lock()
        self.file_list = glob.glob(list_dir + '*')
        self.done_file = done_file
        self.fail_file = fail_file

    def clearEmptyRows(self):
        """Rewrite every list file with its empty lines removed."""
        with self.lock:
            for url_list in self.file_list:
                with open(url_list, 'r') as f:
                    urls = [row for row in f.read().split('\n') if row != '']
                with open(url_list, 'w') as f:
                    f.write('\n'.join(urls))

    def hasFile(self):
        """Return True while at least one list file is still pending."""
        return bool(self.file_list)

    def _appendLine(self, path, url):
        """Append ``url`` plus a newline to the log file at ``path``."""
        with self.lock:
            with open(path, 'a') as f:
                f.write(url + '\n')

    def saveDone(self, url):
        """Record ``url`` as successfully downloaded in ``self.done_file``."""
        self._appendLine(self.done_file, url)

    def saveFail(self, url):
        """Record ``url`` as failed in ``self.fail_file``."""
        self._appendLine(self.fail_file, url)

    def extractDomain(self, url):
        """Return the part of ``url`` between the scheme and the first '/'."""
        return url.replace('http://', '').replace('https://', '').split('/')[0]

    def getUrl(self):
        """Pick a random URL, remove it from its list file and return it.

        A URL is only handed out when no running thread is named after a
        URL of the same domain (download threads are expected to carry
        their URL as thread name). If the drawn URL's domain is busy, the
        lock is released, the method sleeps ``retry_wait`` seconds and
        draws again; this replaces the former unbounded recursion, which
        could hit the recursion limit while waiting for a busy domain.

        When the last URL of a list file is taken, the file is deleted and
        dropped from ``self.file_list``.

        Raises:
            IndexError: if ``self.file_list`` is empty (check ``hasFile()``
                first).
        """
        while True:
            with self.lock:
                url_file = random.choice(self.file_list)
                with open(url_file, 'r') as f:
                    urls = f.read().split('\n')
                i = random.randrange(len(urls))
                domain_new = self.extractDomain(urls[i])
                domain_busy = any(
                    self.extractDomain(thread.name) == domain_new
                    for thread in threading.enumerate()
                )
                if not domain_busy:
                    result = urls.pop(i)
                    if urls:
                        with open(url_file, 'w') as f:
                            f.write('\n'.join(urls))
                    else:
                        os.remove(url_file)
                        self.file_list.remove(url_file)
                    return result
            # Sleep without holding the lock so that the running downloads
            # can still write their done/fail entries and finish.
            time.sleep(self.retry_wait)
def asyncDownload(timeout=30):
    """Download all URLs from the list files using several threads.

    One thread is started per URL and named after that URL;
    ``FileHandler.getUrl`` inspects the thread names to avoid handing out
    a URL whose domain is already being downloaded. After each start the
    main loop sleeps in ``wait_sec`` steps while the active thread count
    exceeds ``max_download``.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block a worker thread forever.
    """

    def ensureDir(path):
        # makedirs instead of exists()/mkdir(): it also creates result_dir
        # itself, and tolerating "already exists" avoids a race between
        # threads that create the same directory at the same time.
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    def saveImage(response):
        """Write the response body below ``result_dir``, mirroring the URL path."""
        path_relative = response.url.replace('http://', '').replace('https://', '')
        target = '{result_dir}{path_relative}'.format(
            result_dir=result_dir, path_relative=path_relative)
        ensureDir(os.path.dirname(target))
        with open(target, 'wb') as f:
            f.write(response.content)

    def downloadImage(file_handler, url):
        """Fetch one URL, store it and log it as done or failed."""
        print('download ' + url)
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
            # Treat HTTP error statuses as failures instead of saving the
            # error page as if it were the image.
            res.raise_for_status()
            saveImage(res)
        except (requests.exceptions.RequestException, IOError, OSError) as e:
            file_handler.saveFail(url)
            # str() is required: concatenating the exception object itself
            # raises TypeError.
            print('fail ' + str(e))
        else:
            file_handler.saveDone(url)
            print('done ' + url)

    file_handler = FileHandler()
    file_handler.clearEmptyRows()
    while file_handler.hasFile():
        url = file_handler.getUrl()
        worker = threading.Thread(
            name=url, target=downloadImage, args=(file_handler, url))
        worker.start()
        # Throttle: do not pick the next URL until a slot is free.
        while threading.active_count() > max_download:
            time.sleep(wait_sec)
if __name__ == '__main__':
    # Start downloading only when run as a script, so importing this
    # module has no side effects.
    asyncDownload()
Threading - Parallelverarbeitung durch Threads verwalten
Herunterladen von Boobs-Bildern - Python-Version 2012
Recommended Posts