Ich habe es grob geschrieben.
async_web.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import aiohttp
import asyncio
async def download_file(title, url):
    """Download *url* and save the response body to "<title>.txt".

    :param title: str -- base name (without extension) for the output file
    :param url: str -- URL to fetch
    :return: str -- the local file name that was written
    """
    local_filename = title + ".txt"
    # BUG FIX: ClientSession is an *async* context manager; the original used
    # a plain "with", which fails with a TypeError on modern aiohttp.
    async with aiohttp.ClientSession() as client:
        async with client.get(url) as resp:
            # Fail fast on non-2xx responses.  The original used "assert",
            # which is silently stripped under "python -O".
            resp.raise_for_status()
            data = await resp.text()
            await save_file(local_filename, data)
    return local_filename
async def save_file(filename, text):
    """Save *text* to ./data/<filename>.

    NOTE: this still performs *blocking* file I/O, so it stalls the event
    loop while writing -- the article's own measurements confirm this.

    :param filename: str
    :param text: str
    """
    path = "./data/{}".format(filename)
    # Context manager guarantees the handle is closed even on error (the
    # original leaked it on exceptions); an explicit encoding avoids a crash
    # on non-ASCII Wikipedia text under a non-UTF-8 platform default.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
async def task_download(title, url):
    """Thin task wrapper around download_file.

    NOTE(review): the original declared its parameters as (url, title) while
    the call site passed (title, url) positionally and the body swapped them
    again -- two mistakes that cancelled out.  The parameters are renamed to
    the order in which they are actually used; positional callers see no
    change.
    """
    await download_file(title, url)
# (title, url) pairs: Japanese-Wikipedia pages of the NPB baseball teams plus
# the national football team.  The URL paths are percent-encoded page titles.
urls = [
    ['Yakult', 'https://ja.wikipedia.org/wiki/%E6%9D%B1%E4%BA%AC%E3%83%A4%E3%82%AF%E3%83%AB%E3%83%88%E3%82%B9%E3%83%AF%E3%83%AD%E3%83%BC%E3%82%BA'],
    ['Riese', 'https://ja.wikipedia.org/wiki/%E8%AA%AD%E5%A3%B2%E3%82%B8%E3%83%A3%E3%82%A4%E3%82%A2%E3%83%B3%E3%83%84'],
    ['Hanshin', 'https://ja.wikipedia.org/wiki/%E9%98%AA%E7%A5%9E%E3%82%BF%E3%82%A4%E3%82%AC%E3%83%BC%E3%82%B9'],
    ['Hiroshima', 'https://ja.wikipedia.org/wiki/%E5%BA%83%E5%B3%B6%E6%9D%B1%E6%B4%8B%E3%82%AB%E3%83%BC%E3%83%97'],
    ['Chunichi', 'https://ja.wikipedia.org/wiki/%E4%B8%AD%E6%97%A5%E3%83%89%E3%83%A9%E3%82%B4%E3%83%B3%E3%82%BA'],
    ['Yokohama', 'https://ja.wikipedia.org/wiki/%E6%A8%AA%E6%B5%9CDeNA%E3%83%99%E3%82%A4%E3%82%B9%E3%82%BF%E3%83%BC%E3%82%BA'],
    ['Sofban', 'https://ja.wikipedia.org/wiki/%E7%A6%8F%E5%B2%A1%E3%82%BD%E3%83%95%E3%83%88%E3%83%90%E3%83%B3%E3%82%AF%E3%83%9B%E3%83%BC%E3%82%AF%E3%82%B9'],
    ['Sonnenschinken', 'https://ja.wikipedia.org/wiki/%E5%8C%97%E6%B5%B7%E9%81%93%E6%97%A5%E6%9C%AC%E3%83%8F%E3%83%A0%E3%83%95%E3%82%A1%E3%82%A4%E3%82%BF%E3%83%BC%E3%82%BA'],
    ['Lotte', 'https://ja.wikipedia.org/wiki/%E5%8D%83%E8%91%89%E3%83%AD%E3%83%83%E3%83%86%E3%83%9E%E3%83%AA%E3%83%BC%E3%83%B3%E3%82%BA'],
    ['Seibu', 'https://ja.wikipedia.org/wiki/%E5%9F%BC%E7%8E%89%E8%A5%BF%E6%AD%A6%E3%83%A9%E3%82%A4%E3%82%AA%E3%83%B3%E3%82%BA'],
    ['ORIX', 'https://ja.wikipedia.org/wiki/%E3%82%AA%E3%83%AA%E3%83%83%E3%82%AF%E3%82%B9%E3%83%BB%E3%83%90%E3%83%95%E3%82%A1%E3%83%AD%E3%83%BC%E3%82%BA'],
    ['Rakuten', 'https://ja.wikipedia.org/wiki/%E6%9D%B1%E5%8C%97%E6%A5%BD%E5%A4%A9%E3%82%B4%E3%83%BC%E3%83%AB%E3%83%87%E3%83%B3%E3%82%A4%E3%83%BC%E3%82%B0%E3%83%AB%E3%82%B9'],
    ['Japanische Fußballnationalmannschaft', 'https://ja.wikipedia.org/wiki/%E3%82%B5%E3%83%83%E3%82%AB%E3%83%BC%E6%97%A5%E6%9C%AC%E4%BB%A3%E8%A1%A8'],
]
# Pre-3.7 asyncio driver: schedule one download per URL and block until
# asyncio.wait reports all of them done.
# NOTE(review): passing bare coroutines to asyncio.wait() is deprecated since
# Python 3.8 and rejected in 3.11+; the modern form is
# asyncio.run(asyncio.gather(*coros)).
loop = asyncio.get_event_loop()
tasks = asyncio.wait([task_download(title, url) for title, url in urls])
loop.run_until_complete(tasks)
loop.close()
■ Threading graph
■ Asyncio graph
Betrachtet man das Asyncio-Diagramm, so ist der Abschnitt Task 15–27 der Teil, in dem die HTTP-Kommunikation mit "aiohttp" ausgeführt wird; hier wird sauber zwischen den Tasks gewechselt. Es zeigt sich jedoch, dass die nachfolgende Verarbeitung Zeit kostet: Da die Funktion "save_file" beim Schreiben auf die Festplatte blockierende E/A verwendet, blockiert sie die Event-Loop, und das Warten auf den Schreibvorgang dauert entsprechend lange. Mit anderen Worten: Ein Umschreiben der Funktion "save_file" auf nicht blockierende E/A sollte dies verbessern.
Ich wollte die Funktion "save_file" so umschreiben, dass sie mit nicht blockierender E/A auf die Festplatte schreibt, wusste aber nicht, wie das geht, und habe sie daher stattdessen auf die altmodische Art mit Multiprocessing umgesetzt.
async_web_mp.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import random
import aiohttp
import asyncio
import multiprocessing as mp
async def task_download(i, title, url):
    """Fetch one URL and hand the body to a worker process for writing.

    :param i: int -- task index, only used for progress output
    :param title: str -- base name for the output file
    :param url: str -- URL to fetch
    :return: str -- local file name the worker process writes to
    """
    local_filename = title + "_mp.txt"
    # BUG FIX: ClientSession must be entered with "async with"; a plain
    # "with" raises a TypeError on modern aiohttp.
    async with aiohttp.ClientSession() as client:
        async with client.get(url) as resp:
            # Fail fast on non-2xx responses (instead of an "assert", which
            # disappears under "python -O").
            resp.raise_for_status()
            data = await resp.text()
            print(i, title, url)
            # Offload the blocking disk write to a separate process so the
            # event loop is not stalled.  NOTE(review): the process is never
            # join()ed; the script relies on it finishing on its own.
            process = mp.Process(target=save_file, args=(local_filename, data))
            process.start()
    return local_filename
def save_file(filename, text):
    """Save *text* to ./data/<filename> (runs in a worker process).

    :param filename: str
    :param text: str
    """
    path = "./data/{}".format(filename)
    # Context manager closes the handle even on error (the original leaked it
    # on exceptions); explicit encoding avoids crashing on non-ASCII
    # Wikipedia text under a non-UTF-8 platform default.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
# Random query-string suffix appended to every URL to defeat caching between
# benchmark runs.
n = "?{}".format(str(random.randint(1, 100000))) # prevent caching
# Same (title, url) pairs as in async_web.py above.
urls = [
    ['Yakult', 'https://ja.wikipedia.org/wiki/%E6%9D%B1%E4%BA%AC%E3%83%A4%E3%82%AF%E3%83%AB%E3%83%88%E3%82%B9%E3%83%AF%E3%83%AD%E3%83%BC%E3%82%BA'],
    ['Riese', 'https://ja.wikipedia.org/wiki/%E8%AA%AD%E5%A3%B2%E3%82%B8%E3%83%A3%E3%82%A4%E3%82%A2%E3%83%B3%E3%83%84'],
    ['Hanshin', 'https://ja.wikipedia.org/wiki/%E9%98%AA%E7%A5%9E%E3%82%BF%E3%82%A4%E3%82%AC%E3%83%BC%E3%82%B9'],
    ['Hiroshima', 'https://ja.wikipedia.org/wiki/%E5%BA%83%E5%B3%B6%E6%9D%B1%E6%B4%8B%E3%82%AB%E3%83%BC%E3%83%97'],
    ['Chunichi', 'https://ja.wikipedia.org/wiki/%E4%B8%AD%E6%97%A5%E3%83%89%E3%83%A9%E3%82%B4%E3%83%B3%E3%82%BA'],
    ['Yokohama', 'https://ja.wikipedia.org/wiki/%E6%A8%AA%E6%B5%9CDeNA%E3%83%99%E3%82%A4%E3%82%B9%E3%82%BF%E3%83%BC%E3%82%BA'],
    ['Sofban', 'https://ja.wikipedia.org/wiki/%E7%A6%8F%E5%B2%A1%E3%82%BD%E3%83%95%E3%83%88%E3%83%90%E3%83%B3%E3%82%AF%E3%83%9B%E3%83%BC%E3%82%AF%E3%82%B9'],
    ['Sonnenschinken', 'https://ja.wikipedia.org/wiki/%E5%8C%97%E6%B5%B7%E9%81%93%E6%97%A5%E6%9C%AC%E3%83%8F%E3%83%A0%E3%83%95%E3%82%A1%E3%82%A4%E3%82%BF%E3%83%BC%E3%82%BA'],
    ['Lotte', 'https://ja.wikipedia.org/wiki/%E5%8D%83%E8%91%89%E3%83%AD%E3%83%83%E3%83%86%E3%83%9E%E3%83%AA%E3%83%BC%E3%83%B3%E3%82%BA'],
    ['Seibu', 'https://ja.wikipedia.org/wiki/%E5%9F%BC%E7%8E%89%E8%A5%BF%E6%AD%A6%E3%83%A9%E3%82%A4%E3%82%AA%E3%83%B3%E3%82%BA'],
    ['ORIX', 'https://ja.wikipedia.org/wiki/%E3%82%AA%E3%83%AA%E3%83%83%E3%82%AF%E3%82%B9%E3%83%BB%E3%83%90%E3%83%95%E3%82%A1%E3%83%AD%E3%83%BC%E3%82%BA'],
    ['Rakuten', 'https://ja.wikipedia.org/wiki/%E6%9D%B1%E5%8C%97%E6%A5%BD%E5%A4%A9%E3%82%B4%E3%83%BC%E3%83%AB%E3%83%87%E3%83%B3%E3%82%A4%E3%83%BC%E3%82%B0%E3%83%AB%E3%82%B9'],
    ['Japanische Fußballnationalmannschaft', 'https://ja.wikipedia.org/wiki/%E3%82%B5%E3%83%83%E3%82%AB%E3%83%BC%E6%97%A5%E6%9C%AC%E4%BB%A3%E8%A1%A8'],
]
# Pre-3.7 asyncio driver; one task per URL, with the cache-buster appended.
# NOTE(review): asyncio.wait() on bare coroutines is deprecated (3.8) and
# rejected in 3.11+.
loop = asyncio.get_event_loop()
tasks = asyncio.wait([task_download(i, x[0], x[1] + n) for i, x in enumerate(urls)])
loop.run_until_complete(tasks)
loop.close()
Es verbesserte sich auf 11,5 Sekunden >> 5,5 Sekunden.
jawiki-latest-all-titles-in-ns0
async_web_counter.py
# -*- coding: utf-8 -*-
import threading
import time
import urllib
import urllib.parse
class WikipediaCrawler(object):
    """Yields Wikipedia article URLs built from a dump of all page titles.

    One instance is shared by several worker threads; an internal lock
    serialises URL production so only one thread yields at a time.
    """

    # Title dump: one article title per line (jawiki ns0 titles file).
    PATH = './data/jawiki-latest-all-titles-in-ns0'

    def __init__(self):
        # Shared lock guarding URL production across worker threads.
        self.lock = threading.Lock()

    def get_url(self):
        """Generator: yield one Wikipedia URL per title in the dump file."""
        # Context manager closes the file (the original leaked the handle).
        with open(WikipediaCrawler.PATH, 'r') as f:
            for title in f:
                time.sleep(0.5)
                if not self.lock.acquire(timeout=3):
                    # Could not obtain the lock within 3 s; skip this title.
                    # BUG FIX: the original printed the undefined global "t";
                    # report the current thread instead.
                    print('%s: Cannot acquire lock (timed out)'
                          % threading.current_thread().name)
                    continue
                # Lock acquired.
                try:
                    time.sleep(0.5)
                    # BUG FIX: the original called WikipediaCrawler.get_url(title)
                    # here, i.e. this generator itself with the title bound to
                    # "self" -- it yielded generator objects, never URLs.  The
                    # intended call is the URL builder below.
                    url = WikipediaCrawler.get_wikipedia_url(title)
                    print(url)
                    yield url
                finally:
                    # Release so other worker threads can proceed.
                    self.lock.release()

    @classmethod
    def get_wikipedia_url(cls, title):
        """Build a Japanese-Wikipedia article URL from a raw title line.

        :param title: str -- article title, possibly with a trailing newline
        :return: str -- fully percent-encoded article URL
        """
        _base_url = "https://ja.wikipedia.org/wiki/{}"
        # BUG FIX: Python 3 moved quote_plus into urllib.parse
        # (urllib.quote_plus is Python 2 only).  Stripping the newline before
        # quoting replaces the original "url[:-3]" hack, which merely chopped
        # off the encoded "%0A".
        return _base_url.format(urllib.parse.quote_plus(title.strip()))
def worker(crawler):
    """Thread target: drain the crawler's URL generator, printing each URL.

    :param crawler: WikipediaCrawler (anything exposing a get_url() iterator)
    """
    # The unused local "t = threading.current_thread()" from the original is
    # removed.  NOTE(review): get_url() already prints each URL, so every URL
    # appears twice on stdout; kept for parity with the original behaviour.
    for url in crawler.get_url():
        print(url)
# Spawn three worker threads that all share one WikipediaCrawler instance;
# the crawler's internal lock serialises URL production between them.
threads_count = 3
threads = []
crawler = WikipediaCrawler()
for i in range(threads_count):
    t = threading.Thread(target=worker, args=(crawler,))
    threads.append(t)
    t.start()
# Block until every worker thread has drained the generator.
for t in threads:
    t.join()
What’s New in PyCharm 5 [Erkunden der Python-Standardbibliothek (18) ~ threading.Lock ~](http://mocobeta-backup.tumblr.com/post/86764185357/python-%E6%A8%99%E6%BA%96%E3%83%A9%E3%82%A4%E3%83%96%E3%83%A9%E3%83%AA%E6%8E%A2%E8%A8%AA-18-threadinglock%E7%B7%A8)
Recommended Posts