Einführung in die Datenanalyse mit Python P32-P43 [ch02 3.US Babynamen 1880-2010]

Umgebung
- win10 64bit
- Python 3.5.2
- Anaconda 4.1.1 (64-bit)
Buch
- Python for Data Analysis( by Wes McKinney )(Final Release Date: October 2012 )
Daten
- https://github.com/wesm/pydata-book
# %load ipython_log.py
# IPython log file
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# __Versuchen Sie zunächst, eine Datei in den Frame zu importieren__________________________
names1880=pd.read_csv('yob1880.txt',names=['name', 'sex', 'births'])
names1880.groupby('sex').births.sum()


# __Alles yob nantoka.Wenn Sie txt in einen Frame integrieren möchten__________________________
pieces=[]
columns=['name', 'sex', 'births']
years=range(1880,2011)
for year in years:
	path='yob%d.txt' % year
	frame=pd.read_csv(path,names=columns)
	frame['year']=year
	pieces.append(frame)
	names=pd.concat(pieces,ignore_index=True)   #Index ignorieren Wenn dieser Wert auf True gesetzt ist, wird der Index in der linken Textspalte entsprechend geändert.
					 #Ein riesiger Rahmen von 1690783 Zeilen wird erstellt. Die Suche ist immer noch schnell


# __Veränderungen in der Anzahl der Geburten__________________________
total_births=names.pivot_table(names, index='year', columns='sex', aggfunc=sum)
'''
Im Buch
 total_births = names.pivot_table('births', rows='year',cols='sex', aggfunc=sum)
Aber das ist alt
 rows=>index
 cols=>columns
Gewechselt zu
Sie müssen einen Datenrahmen in das erste Argument einfügen
 (Im zweiten Argument'birth'Ich dachte, ich müsste es hineinstecken, aber es wurde abgestoßen ...?)
'''
total_births.tail()
total_births.plot(title='Total births by sex and year')
'''Versuchen Sie, die Anzahl der Geburten jedes Namens für die Gesamtzahl der Geburten pro Requisite herauszufinden'''
#Datenjahr,Nach Geschlecht gruppieren und in eine neue Spalte einfügen





def add_prop(group):
	'''Fügen Sie der Gruppe eine Spalte hinzu, in der das Verhältnis der Anzahl der Geburten für jeden Namen zur Gesamtzahl der Geburten angegeben ist'''
	births = group.births.astype(float)   # Integer division floors
	group['prop'] = births / births.sum()   #Ergebnis zur Spalte namens prop hinzufügen
	return group

names = names.groupby(['year', 'sex']).apply(add_prop)

np.allclose(names.groupby(['year','sex']).prop.sum(),1)   #1, wenn alle Requisiten hinzugefügt wurden(100%)Bestätige das
	#Überprüfen Sie durch allclose Namen.groupby(['year','sex']).prop.sum()Überprüfen Sie, ob 1 und 1 gleich sind










#__Analyzing Naming Trends__________________________ 
def get_top1000(group):
	'''
Top1000 Namen für jedes Geschlecht/Auszug aus dem Jahr. Bilden Sie eine andere Gruppe
	<Altes Format>
	return group.sort_index(by='births', ascending=False)[:1000]
	'''
	return group.sort_values(by='births', ascending=False)[:1000]

grouped=names.groupby(['year','sex'])
top1000=grouped.apply(get_top1000)
'''
__top1000 Another Way__________________________ 
pieces = []
for year, group in names.groupby(['year', 'sex']):
	pieces.append(group.sort_index(by='births', ascending=False)[:1000])
top1000 = pd.concat(pieces, ignore_index=True)
'''

boys=top1000[top1000.sex=='M']
girls=top1000[top1000.sex=='F']

total_births=top1000.pivot_table('births',index='year',columns='name',aggfunc=sum)   #data munging(Datenkonvertierung)Fix mit Pivot-Tabelle zu tun
subset=total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12,10),grid=False, title="Number of births per year")








# __Measuring the increase in naming diversity__________________________
table=top1000.pivot_table('prop',index='year',columns='sex',aggfunc=sum)   #Drehen Sie das Verhältnis, um die Vielfalt der Namen zu untersuchen_Versuche einen Tisch zu machen
table.plot(title='Sum of table1000.prop by year and sex',yticks=np.linspace(0,1.2,13),xticks=range(1880,2020,10))

# __Sortieren Sie die Datenprozentsätze für 2010__________________________ 
df=boys[boys.year==2010]
prop_cumsum=df.sort_values(by='prop',ascending=False).prop.cumsum()   #Kumulative Summe kumulative Summe`cumsum`Verwenden Sie Verhältnis Prop=0.Holen Sie sich einen Namen, der 5 erreicht
prop_cumsum[:10]
prop_cumsum.searchsorted(0.5)   #Der Index zu diesem Zeitpunkt wird mithilfe der suchsortierten Funktion ermittelt, und in der sortierten Spalte wird 0 angezeigt..5 Cumsum einfügen

# __Ähnlich im Jahr 1990__________________________
df=boys[boys.year==1900]
in1900=df.sort_values(by='prop',ascending=False).prop.cumsum()
in1900.searchsorted(0.5)+1
prop_cumsum[:10]
prop_cumsum=df.sort_values(by='prop',ascending=False).prop.cumsum()

# __1990,Mach das, was du 2010 für alle Jahre getan hast__________________________
def get_quantile_count(group, q=0.5):
	'''
0 der sortierten und kumulierten Requisitensumme.Gibt einen Index von 5 zurück
	<Altes Format>
		group = group.sort_index(by='prop', ascending=False)
		return group.prop.cumsum().searchsorted(q) + 1
	'''
	group = group.sort_values(by='prop', ascending=False)
	return group.prop.cumsum().values.searchsorted(q) + 1   #return group.prop.cumsum().values.searchsorted(q) +Auf 1 Werte umschreiben Verhindert, dass die hinzugefügten Werte in einem Array angezeigt werden

diversity=top1000.groupby(['year','sex']).apply(get_quantile_count)
diversity=diversity.unstack('sex')   #Nach Geschlecht

diversity.head()
diversity.plot(title="Number of popular names in top 50%")   #Es stellt sich heraus, dass Frauen in allen Jahren mehr Namensvielfalt haben



# __The "Last letter" Revolution__________________________ 
get_last_letter=lambda x:x[-1]   #Extrahieren Sie nur die letzte Zeichenfolge aus den Namensspalten
last_letters=names.name.map(get_last_letter)   #Formel zum Extrahieren der letzten Zeichenfolge für alle Namen in Namen(get_last_letter)Anwenden
last_letters.name='last_letter'
table=names.pivot_table('births',index=last_letters,columns=['sex','year'],aggfunc=sum)

subtable=table.reindex(columns=[1910,1960,2010],level='year')
subtable.head()


subtable.sum()   #Gesamtzahl der Geburten
letter_prop=subtable/subtable.sum().astype(float)   #String-Verhältnis abrufen

fig,axes=plt.subplots(2,1,figsize=(10,8))   #Wird in einer zweistufigen Nebenhandlung angezeigt
letter_prop['M'].plot(kind='bar',rot=0,ax=axes[0],title='Male')
letter_prop['F'].plot(kind='bar',rot=0,ax=axes[1],title='Female',legend=False)

#Das Verhältnis der männlichen Namen, die mit n enden, ist extrem hoch
#Jährlicher Übergang des letzten unten'n'Schau dir den Namen an

letter_prop=table/table.sum().astype(float)   #last_Verhältnis zur Gesamtzahl jedes Buchstabens
dny_ts=letter_prop.ix[['d','n','y'],'M'].T   #d,n,Versuchen Sie, nur y zu zeichnen
																				#'M'Nur für Männer
																				# .Erstellen Sie mit T eine transponierte Matrix von Datenrahmen
# dny_ts.head()
# dny_ts.plot()
# plt.show()













#__Boy names that became girl names (and vice versa)__________________________ 
all_names=top1000.name.unique()   #Aufzählung aller Namen
mask=np.array(['lesl' in x.lower() for x in all_names])   #lower()Konvertierungsmethode in Kleinbuchstaben
															#'lesl'Gibt True zurück, wenn es die Zeichen enthält
#[In]# mask
#[Out]# array([False, False, False, ..., False, False, False], dtype=bool)




lesley_like=all_names[mask]   #all_In Namen'lesl'Nur diejenigen, die Lesley enthalten_In wie speichern
#[In]# lesley_like
#[Out]# array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)





filtered=top1000[top1000.name.isin(lesley_like)]   #Extrahieren Sie nur lesley-ähnliche Typen aus top1000
#[In]# filtered
#[Out]#                     name sex  births  year      prop
#[Out]# year sex                                            
#[Out]# 1880 F   654      Leslie   F       8  1880  0.000088
#[Out]#      M   1108     Leslie   M      79  1880  0.000715
#[Out]# 1881 F   2523     Leslie   F      11  1881  0.000120
#[Out]#      M   3072     Leslie   M      92  1881  0.000913
#[Out]# 1882 F   4593     Leslie   F       9  1882  0.000083
#[Out]#      M   5081     Leslie   M     122  1882  0.001073
#[Out]#          5865     Lesley   M       6  1882  0.000053
#[Out]# 1883 F   6850     Leslie   F       7  1883  0.000062
#[Out]#      M   7225     Leslie   M     120  1883  0.001147
#[Out]#          8093     Lesley   M       5  1883  0.000048
#[Out]# 1884 F   8697     Leslie   F      15  1884  0.000116
#[Out]#      M   9432     Leslie   M     125  1884  0.001092
#[Out]# 1885 F   11161    Leslie   F      10  1885  0.000075
#[Out]#      M   11751    Leslie   M     122  1885  0.001132
#[Out]# 1886 F   13601    Leslie   F       8  1886  0.000055
#[Out]#      M   14132    Leslie   M     136  1886  0.001228
#[Out]# 1887 F   15806    Leslie   F      12  1887  0.000082
#[Out]#      M   16524    Leslie   M     166  1887  0.001637
#[Out]# 1888 F   18030    Leslie   F      23  1888  0.000129
#[Out]#      M   19074    Leslie   M     175  1888  0.001448
#[Out]# 1889 F   20690    Leslie   F      23  1889  0.000129
#[Out]#      M   21737    Leslie   M     155  1889  0.001402
#[Out]# 1890 F   23332    Leslie   F      20  1890  0.000105
#[Out]#      M   24372    Leslie   M     181  1890  0.001630
#[Out]# 1891 F   25928    Leslie   F      28  1891  0.000151
#[Out]#      M   27068    Leslie   M     164  1891  0.001621
#[Out]# 1892 F   28704    Leslie   F      22  1892  0.000104
#[Out]#      M   29851    Leslie   M     207  1892  0.001696
#[Out]# 1893 F   31576    Leslie   F      26  1893  0.000122
#[Out]#      M   32765    Leslie   M     185  1893  0.001647
#[Out]# ...                  ...  ..     ...   ...       ...
#[Out]# 2000 F   1332261  Leslie   F    3619  2000  0.001995
#[Out]#          1332560   Lesly   F     742  2000  0.000409
#[Out]#          1332601  Lesley   F     658  2000  0.000363
#[Out]# 2001 F   1362012  Leslie   F    3610  2001  0.002007
#[Out]#          1362300   Lesly   F     801  2001  0.000445
#[Out]#          1362452  Lesley   F     509  2001  0.000283
#[Out]# 2002 F   1392272  Leslie   F    3520  2002  0.001962
#[Out]#          1392586   Lesly   F     717  2002  0.000400
#[Out]#          1392743  Lesley   F     471  2002  0.000262
#[Out]# 2003 F   1422818  Leslie   F    3635  2003  0.001992
#[Out]#          1423091   Lesly   F     838  2003  0.000459
#[Out]#          1423330  Lesley   F     451  2003  0.000247
#[Out]# 2004 F   1453982  Leslie   F    3497  2004  0.001908
#[Out]#          1454295   Lesly   F     747  2004  0.000408
#[Out]#          1454500  Lesley   F     450  2004  0.000245
#[Out]# 2005 F   1486010  Leslie   F    3120  2005  0.001692
#[Out]#          1486308   Lesly   F     783  2005  0.000425
#[Out]#          1486623  Lesley   F     381  2005  0.000207
#[Out]# 2006 F   1518523  Leslie   F    3035  2006  0.001600
#[Out]#          1518834   Lesly   F     761  2006  0.000401
#[Out]#          1519161  Lesley   F     370  2006  0.000195
#[Out]# 2007 F   1552581  Leslie   F    2689  2007  0.001403
#[Out]#          1552882   Lesly   F     765  2007  0.000399
#[Out]#          1553271  Lesley   F     351  2007  0.000183
#[Out]# 2008 F   1587484  Leslie   F    2323  2008  0.001233
#[Out]#          1587788   Lesly   F     699  2008  0.000371
#[Out]# 2009 F   1622503  Leslie   F    1975  2009  0.001081
#[Out]#          1622845   Lesly   F     598  2009  0.000327
#[Out]# 2010 F   1657142  Leslie   F    1558  2010  0.000886
#[Out]#          1657525   Lesly   F     502  2010  0.000285
#[Out]# 
#[Out]# [400 rows x 5 columns]











filtered.groupby('name').births.sum()   #Gruppieren Sie nach Namen und summieren Sie die Anzahl der Geburten
#[Out]# name
#[Out]# Leslee      1082
#[Out]# Lesley     35022
#[Out]# Lesli        929
#[Out]# Leslie    370429
#[Out]# Lesly      10067
#[Out]# Name: births, dtype: int64







table=filtered.pivot_table('births',index='year',columns='sex',aggfunc=sum)   # sex,Nach Jahr aggregieren
#[Out]# sex        F      M
#[Out]# year               
#[Out]# 1880     8.0   79.0
#[Out]# 1881    11.0   92.0
#[Out]# 1882     9.0  128.0
#[Out]# 1883     7.0  125.0
#[Out]# 1884    15.0  125.0
#[Out]# 1885    10.0  122.0
#[Out]# 1886     8.0  136.0
#[Out]# 1887    12.0  166.0
#[Out]# 1888    23.0  175.0
#[Out]# 1889    23.0  155.0
#[Out]# 1890    20.0  181.0
#[Out]# 1891    28.0  164.0
#[Out]# 1892    22.0  207.0
#[Out]# 1893    26.0  185.0
#[Out]# 1894    36.0  223.0
#[Out]# 1895    22.0  235.0
#[Out]# 1896    27.0  237.0
#[Out]# 1897    34.0  222.0
#[Out]# 1898    24.0  236.0
#[Out]# 1899    18.0  181.0
#[Out]# 1900    30.0  285.0
#[Out]# 1901    29.0  204.0
#[Out]# 1902    37.0  251.0
#[Out]# 1903    24.0  244.0
#[Out]# 1904    30.0  243.0
#[Out]# 1905    35.0  247.0
#[Out]# 1906    29.0  263.0
#[Out]# 1907    34.0  273.0
#[Out]# 1908    41.0  290.0
#[Out]# 1909    35.0  292.0
#[Out]# ...      ...    ...
#[Out]# 1981  5796.0  500.0
#[Out]# 1982  5814.0  430.0
#[Out]# 1983  4975.0  414.0
#[Out]# 1984  4419.0  367.0
#[Out]# 1985  4168.0  331.0
#[Out]# 1986  3741.0  379.0
#[Out]# 1987  3666.0  290.0
#[Out]# 1988  3555.0  318.0
#[Out]# 1989  3259.0  327.0
#[Out]# 1990  3268.0  295.0
#[Out]# 1991  2920.0  277.0
#[Out]# 1992  2836.0  216.0
#[Out]# 1993  2607.0  201.0
#[Out]# 1994  2685.0  207.0
#[Out]# 1995  2782.0  186.0
#[Out]# 1996  3584.0  176.0
#[Out]# 1997  3847.0  158.0
#[Out]# 1998  4289.0    NaN
#[Out]# 1999  4693.0    NaN
#[Out]# 2000  5019.0    NaN
#[Out]# 2001  4920.0    NaN
#[Out]# 2002  4708.0    NaN
#[Out]# 2003  4924.0    NaN
#[Out]# 2004  4694.0    NaN
#[Out]# 2005  4284.0    NaN
#[Out]# 2006  4166.0    NaN
#[Out]# 2007  3805.0    NaN
#[Out]# 2008  3022.0    NaN
#[Out]# 2009  2573.0    NaN
#[Out]# 2010  2060.0    NaN
#[Out]# 
#[Out]# [131 rows x 2 columns]



table=table.div(table.sum(1),axis=0)   #Normalisierung
# table.tail()
#[Out]# sex     F   M
#[Out]# year         
#[Out]# 2006  1.0 NaN
#[Out]# 2007  1.0 NaN
#[Out]# 2008  1.0 NaN
#[Out]# 2009  1.0 NaN
#[Out]# 2010  1.0 NaN
# table.head()
#[Out]# sex          F         M
#[Out]# year                    
#[Out]# 1880  0.091954  0.908046
#[Out]# 1881  0.106796  0.893204
#[Out]# 1882  0.065693  0.934307
#[Out]# 1883  0.053030  0.946970
#[Out]# 1884  0.107143  0.892857



table.plot(style={'M':'k-','F':'k--'})
#[Out]# <matplotlib.axes._subplots.AxesSubplot at 0x227b78e6400>