[PYTHON] Visualization of Produce 101 Japan trainee ranking by scraping

What is Produce 101 Japan?

PRODUCE 101 JAPAN (see the official site) is the Japanese version of an audition program imported from South Korea. The results of voting for trainees who want to make their debut as singers are announced weekly. As the weeks progress, trainees are eliminated: the field is cut to the top 60 and then to the top 35.

This time, I extract the ranking results from the official website by scraping and visualize how the rankings changed for the trainees who were still in the competition as of the latest ranking (9th week, as of November 29, 2019).

Completion drawing

Some of the trainees' names are hidden. (Image: Unknown.png)

Main flow

  1. Scraping
  2. Data shaping
  3. Visualize ranking

1. Get ranking by scraping

The scraping is wrapped in a function so that each week's ranking can be collected individually. It gets the HTML elements using BeautifulSoup and converts them to text. The data is saved in the form "rank, name, week" (only part of it is shown). (Image: aaa.png)


def getWeeklyRank(week):
    """Scrape one week's trainee ranking and store it in ./weeklyRank.txt.

    Downloads ``https://produce101.jp/rank/?week=<week>``, reads the rank from
    every ``<span class="icon-rank">`` and the name from every
    ``<div class="name">``, and writes one ``rank,name,week`` line per rank.

    Args:
        week: Week number to fetch. Week 1 recreates the output file, any
            later week is appended to it, and a value below 1 does nothing.

    Raises:
        requests.HTTPError: If the site answers with an error status.
        IndexError: If the page has fewer name elements than rank elements.
    """
    import requests
    from bs4 import BeautifulSoup

    #Create new only in the first week and write in additional mode from next week
    if week == 1:
        mode = 'w'
    elif week > 1:
        mode = 'a'
    else:
        #Nothing would be written for such a week, so skip the request as well
        return

    #Get the URL of the ranking page by formatting with numbers
    url = 'https://produce101.jp/rank/?week={}'
    #A timeout keeps the call from hanging forever on a stalled connection
    html = requests.get(url.format(week), timeout=30)
    #Fail here instead of parsing an error page as an empty ranking
    html.raise_for_status()

    #Parse the downloaded HTML with Beautiful Soup
    soup = BeautifulSoup(html.text, 'lxml')
    #Get span and div elements for a particular class
    span_rank = soup.find_all("span", class_="icon-rank")
    div_name = soup.find_all("div", class_="name")

    #Extract the text component of the tag containing rank and name into the list
    ranks = [int(span.text) for span in span_rank]
    names = [div.text for div in div_name]

    #Build every line before touching the file, so a rank/name mismatch cannot
    #leave a truncated or half-written file behind
    rows = ['{},{},{}\n'.format(rank, names[i], week)
            for i, rank in enumerate(ranks)]

    #Save weekly Ranking to csv
    #UTF-8 is what pandas.read_csv assumes by default when the file is read back
    with open('./weeklyRank.txt', mode, encoding='utf-8') as f:
        f.writelines(rows)

Execute the function for each week to get the rankings. (It would be cool to fetch everything automatically, but this time I'll collect them one week at a time.)

#Collect the ranking of every week that had an announcement
#No ranking announced in the 4th week and in the 7th week
for announced_week in (1, 2, 3, 5, 6, 8, 9):
    getWeeklyRank(announced_week)

2. Make it easy to graph by data shaping

Delete everything other than the name, such as the "* Decline" marker, then reshape the data so that each row is a week and each column holds one trainee's rank. Replace the rank of trainees who dropped out along the way with "x". (Image: beforeFormat.png)

#Erase the notation of decline

#Read the whole file, drop every '* Decline' marker, then write it back.
#The with-blocks close the file even if reading or writing fails, and UTF-8
#is the encoding pandas.read_csv assumes by default when it reads the file later.
with open('weeklyRank.txt', 'r', encoding='utf-8') as f:
    data_lines = f.read().replace('* Decline', '')

with open('weeklyRank.txt', 'w', encoding='utf-8') as f:
    f.write(data_lines)

Format the ranking data obtained from the HTML. The field is cut to the top 60 in the 5th week and to the top 35 in the 8th week, so the number of trainees changes; these weeks are handled individually.

def getWeeklyRank_format(data_path):
    """Pivot the scraped ranking into one row per week, one column per trainee.

    Reads the ``rank,name,week`` lines at ``data_path`` (no header row) and
    writes ``./weeklyRank_format.txt``: a heading line ``week,<name>,...``
    listing the week-1 trainees in rank order, then one line for each of the
    weeks 1, 2, 3, 5, 6, 8 and 9 holding the week number followed by every
    trainee's rank in that week.

    A rank is written as ``x`` when the trainee is not in the week-5 ranking
    (for weeks 5 and 6), not in the week-8 ranking (for weeks 8 and 9), or
    has no entry for that week at all.

    Args:
        data_path: Path of the ``rank,name,week`` CSV to read.
    """
    import pandas as pd

    df_rank = pd.read_csv(data_path, header=None, names=['rank', 'name', 'week'])

    #First rank seen for each (week, name) pair, so every later lookup is a
    #dictionary access instead of a filter over the whole table
    rank_of = {}
    for week, name, rank in zip(df_rank['week'], df_rank['name'], df_rank['rank']):
        rank_of.setdefault((week, str(name)), rank)

    #Get a member of week1, ordered by rank (mergesort keeps ties in file order)
    week1_rows = df_rank[df_rank['week'] == 1].sort_values('rank', kind='mergesort')
    name_week1 = [str(name) for name in week1_rows['name']]

    #Members present in week1, week5 and week8; weeks 5 and 8 are the cuts
    members = {
        cut: {name for (week, name) in rank_of if week == cut}
        for cut in (1, 5, 8)
    }
    #Each output week paired with the week whose member list decides who is
    #still ranked; weeks 4 and 7 are left out (no ranking announced)
    week_and_cut = ((1, 1), (2, 1), (3, 1), (5, 5), (6, 5), (8, 8), (9, 8))

    with open('./weeklyRank_format.txt', 'w', encoding='utf-8') as f:
        #Column headings: 'week' followed by the week1 trainees
        f.write(','.join(['week'] + name_week1) + '\n')
        for week, cut in week_and_cut:
            #Write week in column 0
            cells = [str(week)]
            #Next, get each week1 trainee's ranking in this week
            for name in name_week1:
                rank = rank_of.get((week, name)) if name in members[cut] else None
                cells.append('x' if rank is None else str(rank))
            f.write(','.join(cells) + '\n')

Execute the function.

#Convert the scraped ranking in ./weeklyRank.txt into ./weeklyRank_format.txt
getWeeklyRank_format('./weeklyRank.txt')

Let's see if it worked.

import pandas as pd
#Read the formatted table, taking the column headings from the first line
df_rank = pd.read_csv('./weeklyRank_format.txt',header=0)
#Bare expression: displays the table when it is the last line of a notebook cell
df_rank
df.png

3. Visualize the ranking

This time, with appearance in mind, I use a downloaded font, JK Gothic L, so that the Japanese text is rendered.

#Trainee ranking from 1st week to 9th week
#Customize fonts
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

#Apply fonts by directly specifying ttf files
import matplotlib.font_manager
fp = matplotlib.font_manager.FontProperties(fname='/Users/[USER NAME]/.matplotlib/fonts/ttf/JKG-L_3.ttf')

#Set field
fig, axs = plt.subplots(figsize=(10,25))
x = df_rank['week']
axs.set_xlim(0.94,9.1)
axs.set_xticks([1, 2, 3, 4, 5, 6, 7, 8, 9])
axs.set_ylim(99, 0.6)

axs2 = axs.twinx()

labels = list(df_rank.columns[1:])[0:]
axs.set_yticks(list(np.arange(1,99)))
axs.set_yticklabels(labels, fontproperties=fp, color='darkslateblue')
axs.set_xticklabels(['1st week', '2nd week', '3rd week','4th week', '5th week', '6th week', '7th week', '8th week', '9th week'], rotation=0, fontsize=14, fontproperties=fp, color='darkslateblue')
axs.spines['top'].set_visible(False)
axs.spines['bottom'].set_visible(False)
axs.spines['right'].set_visible(False)
axs.spines['left'].set_visible(False)
axs.tick_params(left=False)

labels2 = list((np.arange(0,99)))
axs2.set_yticks(list(np.arange(1,99)))
axs2.set_yticklabels(labels2[99:0:-1], fontproperties=fp, color='darkslateblue')
axs2.set_ylim(0,98)
axs2.spines['top'].set_visible(False)
axs2.spines['bottom'].set_visible(False)
axs2.spines['right'].set_visible(False)
axs2.spines['left'].set_visible(False)
axs2.tick_params(right=False)

#Change the color of the polygonal line to rainbow
cmap = plt.get_cmap('rainbow')
for i in range(1, 99,1):
    y = df_rank[df_rank.columns[i]]
    if 'x' in list(y):
        continue
    else:
        axs.plot(x,y,color=cmap(1-i/100),marker='o',markersize=8,linewidth = 3, alpha=0.3)

This should complete the graph. If you set the conditions, you can visualize the trainees who have greatly improved their ranking.

Recommended Posts

Visualization of Produce 101 Japan trainee ranking by scraping
Visualization of data by prefecture
Visualization of matrix created by numpy
Get a list of Qiita likes by scraping
Analysis of financial data by pandas and its visualization (2)
Analysis of financial data by pandas and its visualization (1)
Visualization method of data by explanatory variable and objective variable