import re
import time
import unicodedata
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
def cleaning(info, team, html):
    """Turn one team's scorer-table rows into flat result rows.

    Parameters:
        info: common per-match prefix (match no., matchday, date, time, teams).
        team: team name to tag each scorer row with.
        html: iterable of <tr> tags, one per goal.

    Returns a list of rows: info + [team, minute, shirt_no, player_name].
    """
    result = []
    for trs in html:
        data = [cell.get_text(strip=True) for cell in trs.select("th, td")]
        # Goal minute: stoppage-time goals are written as "45+2<unit>".
        # Sum the numeric parts instead of eval() — never eval scraped text.
        data[0] = sum(int(part) for part in re.findall(r"\d+", data[0]))
        # Drop the parenthesized marker (e.g. penalty) from the player name
        # after normalizing full-width characters.
        data[2] = re.sub(r"\(.+\)", "", unicodedata.normalize("NFKC", data[2])).strip()
        result.append(info + [team] + data)
    return result
def scraping(n, url):
    """Scrape one match-report page and return its scorer rows.

    Parameters:
        n: sequential match number used to tag every row.
        url: absolute URL of the match detail page.

    Returns:
        A list of rows (match info + team + [minute, shirt_no, player])
        for both teams, or None when the page has no "Ergebnis" section.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html5lib")
    # Matchday: extract the digits robustly — str.strip("Sektion") removes
    # a *set of characters*, not a prefix, and breaks on unexpected text.
    score_season = soup.select_one(
        "div.score-header > h2.score-meta > span.score-season"
    ).get_text(strip=True)
    score_season = int(re.search(r"\d+", score_season).group())
    # Date (and kick-off time).
    score_date = (
        soup.select_one("div.score-header > h2.score-meta > span.score-date")
        .get_text(strip=True)
        .split()
    )
    # Team names.
    score_table = soup.select_one("table.score-table")
    home_team = score_table.select_one("th.score-team1").get_text(strip=True)
    away_team = score_table.select_one("th.score-team2").get_text(strip=True)
    # Common prefix for every scorer row of this match.
    game_info = [n, score_season] + score_date + [home_team, away_team]
    # Scorer tables exist only when the page has an "Ergebnis" heading
    # ("string=" replaces the deprecated "text=" argument of find()).
    tag = soup.find("h3", string="Ergebnis")
    if not tag:
        return None
    table_home = tag.parent.select(
        "div.score-frame > div.score-left > table > tbody > tr"
    )
    table_away = tag.parent.select(
        "div.score-frame > div.score-right > table > tbody > tr"
    )
    home_data = cleaning(game_info, home_team, table_home)
    away_data = cleaning(game_info, away_team, table_away)
    return home_data + away_data
# Fixture list page; each match row links to a detail page.
url = "http://www.jfl.or.jp/jfl-pc/view/s.php?a=1542&f=2020A001_spc.html"
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html5lib")
# Absolute URLs of every "Einzelheiten" (details) link.
links = [
    urljoin(url, link.get("href"))
    for link in soup.select("td.detail-link > a")
    if link.text == "Einzelheiten"
]
result = []
for i, link in enumerate(links):
    score_data = scraping(i, link)
    if score_data:
        result.extend(score_data)
    # Be polite to the server between requests.
    time.sleep(1)
# One row per goal scored.
df = pd.DataFrame(
    result,
    columns=[
        "Spiel", "Sektion", "Datum", "Tageszeiten", "Zuhause", "Weg",
        "Teamname", "Zeit", "Einheitliche Nummer", "Spielername",
    ],
)
# Each row of df represents exactly one goal.
df["Ergebnis"] = 1
# Goals per player: count rows per (player, team, shirt number).
# Own goals ("Eigenes Ziel") are not credited to a scorer, so drop that
# index entry; errors="ignore" keeps a season without own goals working.
# aggfunc="sum" (string alias) avoids the pandas deprecation warning for
# passing the builtin sum.
pv_goal = (
    df.pivot_table(
        values="Ergebnis",
        index=["Spielername", "Teamname", "Einheitliche Nummer"],
        aggfunc="sum",
        fill_value=0,
    )
    .drop(["Eigenes Ziel"], errors="ignore")
    .reset_index()
)
pv_goal["Einheitliche Nummer"] = pv_goal["Einheitliche Nummer"].astype(int)
# Competition ranking: tied scorers share the lowest rank ("min").
pv_goal["Rangfolge"] = pv_goal["Ergebnis"].rank(ascending=False, method="min").astype(int)
#Mannschaft
# 2020 JFL teams in league-table order; the position doubles as a
# tie-break ID when sorting scorers of the same rank.
jfl_2020 = [
    "Honda FC",
    "Sony Sendai FC",
    "Tokyo Musashino City FC",
    "Tegevajaro Miyazaki",
    "Honda Lock SC",
    "Verspa Oita",
    "FC Osaka",
    "MIO Biwako Shiga",
    "Viatin Mie",
    "FC Maruyasu Okazaki",
    "Suzuka Point Getters",
    "Line Mail Aomori",
    "Nara Club",
    "Matsue City FC",
    "Iwaki FC",
    "Kochi United SC",
]
# Team name -> 1-based league position.
team = dict(zip(jfl_2020, range(1, len(jfl_2020) + 1)))
pv_goal["Team ID"] = pv_goal["Teamname"].map(team)
# Order by rank, then team, then shirt number (all ascending), then
# drop the helper columns before writing the CSV.
pv_goal = pv_goal.sort_values(["Rangfolge", "Team ID", "Einheitliche Nummer"])
pv_goal = pv_goal.drop(columns=["Team ID", "Einheitliche Nummer"])
pv_goal = pv_goal.set_index("Rangfolge")
pv_goal.to_csv("goal.csv")
# Official scorer ranking page; the first table on the page is the one
# we want.
ranking_url = "http://www.jfl.or.jp/jfl-pc/view/s.php?a=1544"
df_rank = pd.read_html(ranking_url, index_col=0, header=0)[0]
# Normalize full-width characters so names match the scraped data.
df_rank["Spielername"] = df_rank["Spielername"].str.normalize("NFKC")
df_rank.to_csv("ranking.csv")
# (Article-scrape residue, commented out so the file remains valid Python.)
# Recommended Posts