import re
import time
import unicodedata
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
def cleaning(info, team, html):
    """Flatten one team's score-table rows into records.

    Parameters
    ----------
    info : list
        Match-level prefix fields prepended to every record.
    team : str
        Team name inserted after the match info.
    html : iterable
        BeautifulSoup ``<tr>`` elements, each with cells
        ``[minute, shirt number, player name]`` (in that order).

    Returns
    -------
    list of list
        One record per row: ``info + [team] + [minute, number, name]``.
    """
    result = []
    for trs in html:
        data = [cell.get_text(strip=True) for cell in trs.select("th, td")]
        # Minute cell looks like "45Minutes" or "45+2Minutes" (stoppage
        # time). Strip the unit, then sum the "+"-separated parts.
        # The original used eval() here — never eval scraped text.
        minute = data[0].rstrip("Minutes")
        data[0] = sum(int(part) for part in minute.split("+"))
        # Normalize full-width characters, then drop a parenthesised
        # annotation such as "(PK)" from the player name.
        data[2] = re.sub(r"\(.+\)", "", unicodedata.normalize("NFKC", data[2])).strip()
        result.append(info + [team] + data)
    return result
def scraping(n, url):
    """Fetch one match-report page and extract its scoring records.

    Parameters
    ----------
    n : int
        Sequential match number, stored as the first field of each record.
    url : str
        Absolute URL of the match detail page.

    Returns
    -------
    list of list or None
        Scoring records for both teams (home first), or ``None`` when the
        page has no score table (e.g. a scoreless match).
    """
    r = requests.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html5lib")
    # Matchday header, e.g. "Section 5" -> 5. Note: str.strip takes a
    # character set, not a suffix — works here because digits are not in it.
    score_season = soup.select_one(
        "div.score-header > h2.score-meta > span.score-season"
    ).get_text(strip=True)
    score_season = int(score_season.strip("Section"))
    # Date and kick-off time, split on whitespace into separate fields.
    score_date = (
        soup.select_one("div.score-header > h2.score-meta > span.score-date")
        .get_text(strip=True)
        .split()
    )
    # Team names from the score table header cells.
    score_table = soup.select_one("table.score-table")
    home_team = score_table.select_one("th.score-team1").get_text(strip=True)
    away_team = score_table.select_one("th.score-team2").get_text(strip=True)
    # Match-level prefix shared by every scoring record of this match.
    game_info = [n, score_season] + score_date + [home_team, away_team]
    # Locate the scoring section; string= replaces bs4's deprecated text=.
    tag = soup.find("h3", string="score")
    if tag is None:
        # No score table on this page.
        return None
    # select() already returns a list — no wrapping comprehension needed.
    table_home = tag.parent.select(
        "div.score-frame > div.score-left > table > tbody > tr"
    )
    home_data = cleaning(game_info, home_team, table_home)
    table_away = tag.parent.select(
        "div.score-frame > div.score-right > table > tbody > tr"
    )
    away_data = cleaning(game_info, away_team, table_away)
    return home_data + away_data
# Fixture-list page whose "Details" anchors link to each match report.
url = "http://www.jfl.or.jp/jfl-pc/view/s.php?a=1542&f=2020A001_spc.html"
r = requests.get(url)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html5lib")
# Collect the absolute URLs of the per-match detail pages.
links = []
for anchor in soup.select("td.detail-link > a"):
    if anchor.text == "Details":
        links.append(urljoin(url, anchor.get("href")))
# Scrape every match page, accumulating one record per goal.
result = []
for i, link in enumerate(links):
    score_data = scraping(i, link)
    if score_data:
        result.extend(score_data)
    # Throttle: one request per second to be polite to the server.
    time.sleep(1)
# --- Aggregate the scraped goal records into a scorer ranking ---
df = pd.DataFrame(
    result,
    columns=[
        "match", "section", "date", "Times of Day", "home", "Away",
        "Team name", "time", "Uniform number", "Player name",
    ],
)
# One row per goal, so a constant 1 makes pivot_table count goals.
df["score"] = 1
# Goals per (player, team, shirt number); drop the "Own goal" pseudo-player.
# String "sum" instead of the builtin avoids the pandas aggfunc warning.
pv_goal = (
    df.pivot_table(
        values="score",
        index=["Player name", "Team name", "Uniform number"],
        aggfunc="sum",
        fill_value=0,
    )
    .drop(["Own goal"])
    .reset_index()
)
pv_goal["Uniform number"] = pv_goal["Uniform number"].astype(int)
# Standard competition ranking: tied scorers share the lowest rank number.
pv_goal["Ranking"] = pv_goal["score"].rank(ascending=False, method="min").astype(int)
# 2020 JFL clubs; position in this list provides a stable team sort key.
jfl_2020 = [
    "Honda FC",
    "Sony Sendai FC",
    "Tokyo Musashino City FC",
    "Tegevajaro Miyazaki",
    "Honda Lock SC",
    "Verspah Oita",
    "FC Osaka",
    "MIO Biwako Shiga",
    "Veertien Mie",
    "FC Maruyasu Okazaki",
    "Suzuka Point Getters",
    "Line mail Aomori",
    "Nara club",
    "Matsue City FC",
    "Iwaki FC",
    "Kochi United SC",
]
team = {name: i for i, name in enumerate(jfl_2020, 1)}
pv_goal["Team ID"] = pv_goal["Team name"].map(team)
# Order by ranking, then team, then shirt number, then drop the sort keys.
pv_goal.sort_values(
    ["Ranking", "Team ID", "Uniform number"],
    ascending=[True, True, True],
    inplace=True,
)
pv_goal.drop(["Team ID", "Uniform number"], axis=1, inplace=True)
pv_goal.set_index("Ranking", inplace=True)
pv_goal.to_csv("goal.csv")
# Official ranking table for cross-checking; normalize player names the
# same way as the scraped data (NFKC) before saving.
df_rank = pd.read_html(
    "http://www.jfl.or.jp/jfl-pc/view/s.php?a=1544", index_col=0, header=0
)[0]
df_rank["Player name"] = df_rank["Player name"].str.normalize("NFKC")
df_rank.to_csv("ranking.csv")
# (article-scrape residue removed from code: "Recommended Posts" footer)