The 2020 edition of the 100 Language Processing Knock exercises has been released.
https://nlp100.github.io/ja/
In this article, I simply list my answers for easy reference. The explanations are written up in separate articles, linked before each chapter.
Explanations for problems 00–09:
https://kakedashi-engineer.appspot.com/2020/04/15/nlp100-00-09/
# 00. Reversed string
s = 'stressed'
print(s[::-1])  # => desserts
# 01. "パタトクカシーー": take the 1st, 3rd, 5th, and 7th characters
s = 'パタトクカシーー'
print(s[::2])  # => パトカー
# 02. "パトカー" + "タクシー": interleave the two strings character by character
s1 = 'パトカー'
s2 = 'タクシー'
print(''.join([a + b for a, b in zip(s1, s2)]))  # => パタトクカシーー
s = "Now I need a drink, alcoholic of course, after the heavy lectures involving quantum mechanics."
s = s.replace(',','').replace('.','')
[len(w) for w in s.split()]
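The printed word lengths trace out 3.14159265358979:
# [3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9]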
s = "Hi He Lied Because Boron Could Not Oxidize Fluorine. New Nations Might Also Sign Peace Security Clause. Arthur King Can."
s = s.replace('.','')
idx = [1, 5, 6, 7, 8, 9, 15, 16, 19]
mp = {}
for i,w in enumerate(s.split()):
if (i+1) in idx:
v = w[:1]
else:
v = w[:2]
mp[v] = i+1
print (mp)
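The resulting dict maps element symbols to atomic numbers 1–20 (the mnemonic is imperfect: word 12, "Might", yields 'Mi' rather than the real symbol 'Mg'):
# {'H': 1, 'He': 2, 'Li': 3, 'Be': 4, 'B': 5, 'C': 6, 'N': 7, 'O': 8, 'F': 9, 'Ne': 10, 'Na': 11, 'Mi': 12, 'Al': 13, 'Si': 14, 'P': 15, 'S': 16, 'Cl': 17, 'Ar': 18, 'K': 19, 'Ca': 20}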
# 05. n-gram: works on any sequence (a string or a list of words)
def ngram(S, n):
    r = []
    for i in range(len(S) - n + 1):
        r.append(S[i:i + n])
    return r

s = 'I am an NLPer'
print(ngram(s.split(), 2))  # word bigrams
print(ngram(s, 2))          # character bigrams
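For reference, the two calls should print:
# [['I', 'am'], ['am', 'an'], ['an', 'NLPer']]
# ['I ', ' a', 'am', 'm ', ' a', 'an', 'n ', ' N', 'NL', 'LP', 'Pe', 'er']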
# 06. Sets of character bigrams
def ngram(S, n):
    r = []
    for i in range(len(S) - n + 1):
        r.append(S[i:i + n])
    return r

s1 = 'paraparaparadise'
s2 = 'paragraph'
st1 = set(ngram(s1, 2))
st2 = set(ngram(s2, 2))
print(st1 | st2)  # union
print(st1 & st2)  # intersection
print(st1 - st2)  # difference
print('se' in st1)
print('se' in st2)
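Up to set ordering, the expected results are:
# union: {'pa', 'ar', 'ra', 'ap', 'ad', 'di', 'is', 'se', 'ag', 'gr', 'ph'}
# intersection: {'pa', 'ar', 'ra', 'ap'}
# difference: {'ad', 'di', 'is', 'se'}
# 'se' in st1: True, 'se' in st2: False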
# 07. Template-based sentence generation: "<x>時の<y>は<z>" ("the <y> at <x> o'clock is <z>")
def temperature(x, y, z):
    return str(x) + '時の' + str(y) + 'は' + str(z)

x = 12
y = '気温'  # temperature
z = 22.4
print(temperature(x, y, z))  # => 12時の気温は22.4
# 08. Cipher: map each lowercase letter c to chr(219 - ord(c)) (a↔z, b↔y, ...)
def cipher(S):
    new = []
    for s in S:
        if 97 <= ord(s) <= 122:  # lowercase ASCII
            s = chr(219 - ord(s))
        new.append(s)
    return ''.join(new)

s = 'I am an NLPer'
new = cipher(s)
print(new)
print(cipher(new))
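Since 219 - (219 - c) = c, cipher is its own inverse; the two prints should give:
# I zn zm NLPvi
# I am an NLPer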
# 09. Typoglycemia: shuffle the middle characters of every word longer than 4 letters
import random
s = 'I couldn’t believe that I could actually understand what I was reading : the phenomenal power of the human mind .'
ans = []
for word in s.split():
    if len(word) > 4:
        mid = list(word[1:-1])
        random.shuffle(mid)
        word = word[0] + ''.join(mid) + word[-1]
    ans.append(word)
print(' '.join(ans))
Explanations for problems 10–19:
https://kakedashi-engineer.appspot.com/2020/04/16/nlp100-10-14/
https://kakedashi-engineer.appspot.com/2020/04/17/nlp100-15-19/
# 10. Count lines
import pandas as pd
df = pd.read_csv('popular-names.txt', delimiter='\t', header=None)
print(len(df))

# UNIX equivalent (line count):
wc -l popular-names.txt
# 11. Replace tabs with spaces
import pandas as pd
df = pd.read_csv('popular-names.txt', delimiter='\t', header=None)
df.to_csv('space.txt', sep=' ', header=False, index=False)

# UNIX equivalent:
sed 's/\t/ /g' popular-names.txt > replaced.txt
# 12. Save the 1st and 2nd columns to col1.txt / col2.txt
import pandas as pd
df = pd.read_csv('popular-names.txt', delimiter='\t', header=None)
df.iloc[:, 0].to_csv('col1.txt', sep=' ', header=False, index=False)
df.iloc[:, 1].to_csv('col2.txt', sep=' ', header=False, index=False)

# UNIX equivalent:
cut -f 1 popular-names.txt > col1.txt
cut -f 2 popular-names.txt > col2.txt
# 13. Merge col1.txt and col2.txt with a tab separator
import pandas as pd
df1 = pd.read_csv('col1.txt', delimiter='\t', header=None)
df2 = pd.read_csv('col2.txt', delimiter='\t', header=None)
df = pd.concat([df1, df2], axis=1)
df.to_csv('col1_2.txt', sep='\t', header=False, index=False)

# UNIX equivalent:
paste col1.txt col2.txt > col1_2.txt
# 14. Output the first N lines (N = 5 here)
import pandas as pd
df = pd.read_csv('popular-names.txt', delimiter='\t', header=None)
print(df.head(5))

# UNIX equivalent:
head -n 5 popular-names.txt
# 15. Output the last N lines
import pandas as pd
df = pd.read_csv('popular-names.txt', delimiter='\t', header=None)
print(df.tail(5))

# UNIX equivalent:
tail -n 5 popular-names.txt
# 16. Split the file into N parts
import pandas as pd
N = 3
df = pd.read_csv('popular-names.txt', delimiter='\t', header=None)
step = -(-len(df) // N)  # ceiling division: ceil(len(df) / N), e.g. -(-10 // 3) == 4
for n in range(N):
    df_split = df.iloc[n * step:(n + 1) * step]
    df_split.to_csv('popular-names' + str(n) + '.txt', sep='\t', header=False, index=False)

# UNIX equivalent (GNU split; l/3 splits by lines rather than bytes):
split -n l/3 popular-names.txt
# 17. Distinct strings in the first column
import pandas as pd
df = pd.read_csv('popular-names.txt', delimiter='\t', header=None)
new = df[0].unique()
new.sort()
print(new)

# UNIX equivalent:
cut -f 1 popular-names.txt | sort | uniq
# 18. Sort the third column in descending order (only the column values are shown)
import pandas as pd
df = pd.read_csv('popular-names.txt', delimiter='\t', header=None)
new = df[2].sort_values(ascending=False)
print(new)

# UNIX equivalent:
cut -f 3 popular-names.txt | sort -n -r
# 19. Frequency of the first-column values, in descending order
import pandas as pd
df = pd.read_csv('popular-names.txt', delimiter='\t', header=None)
vc = df[0].value_counts()
vc = pd.DataFrame(vc).reset_index()
vc.columns = ['name', 'count']
vc = vc.sort_values(['count', 'name'], ascending=[False, False])
print(vc)

# UNIX equivalent:
cut -f 1 popular-names.txt | sort | uniq -c | sort -n -r
Explanations for problems 20–30:
https://kakedashi-engineer.appspot.com/2020/04/18/nlp100-20-24/
https://kakedashi-engineer.appspot.com/2020/04/19/nlp100-25-26/
https://kakedashi-engineer.appspot.com/2020/04/20/nlp100-27-28/
https://kakedashi-engineer.appspot.com/2020/04/21/nlp100-29-30/
# 20. Read the JSON data and extract the article on the UK (titled 'イギリス')
import pandas as pd
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
print(uk)
# 21. Lines that declare category names
import pandas as pd
import re
pattern = re.compile('Category')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
for line in ls:
    if re.search(pattern, line):
        print(line)
# 22. Category names only: strip the markup and sort keys (e.g. '|*', '|元')
import pandas as pd
import re
pattern = re.compile('Category')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
for line in ls:
    if re.search(pattern, line):
        line = line.replace('[[', '').replace('Category:', '').replace(']]', '').replace('|*', '').replace('|元', '')
        print(line)
# 23. Section structure
import pandas as pd
import re
pattern = re.compile('^=+.*=+$')  # lines that start and end with one or more '='
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
for line in ls:
    if re.search(pattern, line):
        level = line.count('=') // 2 - 1  # '== name ==' is level 1, '=== name ===' is level 2, ...
        print(line.replace('=', ''), level)
# 24. Media files referenced from the article
import pandas as pd
import re
pattern = re.compile(r'(?:ファイル|File):(.+?)\|')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
for line in ls:
    r = re.findall(pattern, line)
    if r:
        print(r[0])
# 25. Extract the infobox template fields into a dict
import pandas as pd
import re
pattern = re.compile(r'\|(.+?)\s*=\s*(.+)')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
d = {}
for line in ls:
    r = re.search(pattern, line)
    if r:
        d[r[1]] = r[2]
print(d)
# 26. Same as 25, but remove emphasis markup ('' ... '' / ''' ... ''') first
import pandas as pd
import re
pattern = re.compile(r'\|(.+?)\s*=\s*(.+)')
p_emp = re.compile(r"'{2,}(.+?)'{2,}")
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
d = {}
for line in ls:
    line = re.sub(p_emp, '\\1', line)
    r = re.search(pattern, line)
    if r:
        d[r[1]] = r[2]
print(d)
# 27. Additionally remove internal links ([[ ... ]])
import pandas as pd
import re
pattern = re.compile(r'\|(.+?)\s*=\s*(.+)')
p_emp = re.compile(r"'{2,}(.+?)'{2,}")
p_link = re.compile(r'\[\[(.+?)\]\]')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
lines = uk[0]
lines = re.sub(p_emp, '\\1', lines)
lines = re.sub(p_link, '\\1', lines)
ls = lines.split('\n')
d = {}
for line in ls:
    r = re.search(pattern, line)
    if r:
        d[r[1]] = r[2]
print(d)
# 28. Additionally remove <ref> / <br> markup
import pandas as pd
import re
pattern = re.compile(r'\|(.+?)\s*=\s*(.+)')
p_emp = re.compile(r"'{2,}(.+?)'{2,}")
p_link = re.compile(r'\[\[(.+?)\]\]')
p_refbr = re.compile(r'<(?:br|ref)[^>]*?>.+?</(?:br|ref)[^>]*?>')  # paired tags; self-closing tags are left as-is
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
lines = uk[0]
lines = re.sub(p_emp, '\\1', lines)
lines = re.sub(p_link, '\\1', lines)
lines = re.sub(p_refbr, '', lines)
ls = lines.split('\n')
d = {}
for line in ls:
    r = re.search(pattern, line)
    if r:
        d[r[1]] = r[2]
print(d)
# 29. Get the national flag image URL from the Wikimedia Commons API
import pandas as pd
import re
import requests
pattern = re.compile(r'\|(.+?)\s*=\s*(.+)')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
d = {}
for line in ls:
    r = re.search(pattern, line)
    if r:
        d[r[1]] = r[2]

S = requests.Session()
URL = "https://commons.wikimedia.org/w/api.php"
PARAMS = {
    "action": "query",
    "format": "json",
    "titles": "File:" + d['国旗画像'],  # '国旗画像' = the "national flag image" infobox field
    "prop": "imageinfo",
    "iiprop": "url"
}
R = S.get(url=URL, params=PARAMS)
DATA = R.json()
PAGES = DATA['query']['pages']
for k, v in PAGES.items():
    print(v['imageinfo'][0]['url'])
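If the request succeeds, the loop should print a single line: the direct image URL of the flag file under https://upload.wikimedia.org/.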
Explanations for problems 31–39:
https://kakedashi-engineer.appspot.com/2020/04/22/nlp100-31-34/
https://kakedashi-engineer.appspot.com/2020/04/22/nlp100-35-39/
# 30. Load the pre-parsed MeCab output (neko.txt.mecab) into a list of morpheme dicts
path = 'neko.txt.mecab'
with open(path) as f:
    text = f.read().split('\n')
result = []
for line in text[:-1]:  # drop the trailing empty line
    if line == 'EOS':
        continue
    ls = line.split('\t')
    tmp = ls[1].split(',')
    d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
    result.append(d)
# 31. Surface forms of verbs (MeCab tags verbs as '動詞')
path = 'neko.txt.mecab'
with open(path) as f:
    text = f.read().split('\n')
result = []
for line in text[:-1]:
    if line == 'EOS':
        continue
    ls = line.split('\t')
    tmp = ls[1].split(',')
    d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
    result.append(d)
print([d['surface'] for d in result if d['pos'] == '動詞'])
# 32. Base forms of verbs
path = 'neko.txt.mecab'
with open(path) as f:
    text = f.read().split('\n')
result = []
for line in text[:-1]:
    if line == 'EOS':
        continue
    ls = line.split('\t')
    tmp = ls[1].split(',')
    d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
    result.append(d)
print([d['base'] for d in result if d['pos'] == '動詞'])
# 33. Noun phrases of the form "A の B" (two nouns joined by の)
path = 'neko.txt.mecab'
with open(path) as f:
    text = f.read().split('\n')
result = []
for line in text[:-1]:
    if line == 'EOS':
        continue
    ls = line.split('\t')
    tmp = ls[1].split(',')
    d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
    result.append(d)
noun_phrase = []
for i in range(len(result) - 2):
    if result[i]['pos'] == '名詞' and result[i + 1]['surface'] == 'の' and result[i + 2]['pos'] == '名詞':
        noun_phrase.append(result[i]['surface'] + result[i + 1]['surface'] + result[i + 2]['surface'])
print(noun_phrase)
# 34. Concatenate runs of consecutive nouns
path = 'neko.txt.mecab'
with open(path) as f:
    text = f.read().split('\n')
result = []
for line in text[:-1]:
    if line == 'EOS':
        continue
    ls = line.split('\t')
    tmp = ls[1].split(',')
    d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
    result.append(d)
ls_noun = []
noun = ''
for d in result:
    if d['pos'] == '名詞':
        noun += d['surface']
    else:
        if noun != '':
            ls_noun.append(noun)
            noun = ''
else:  # for-else: flush a noun run that reaches the end of the text
    if noun != '':
        ls_noun.append(noun)
        noun = ''
print(ls_noun)
# 35. Word frequency
from collections import Counter

path = 'neko.txt.mecab'
with open(path) as f:
    text = f.read().split('\n')
result = []
for line in text[:-1]:
    if line == 'EOS':
        continue
    ls = line.split('\t')
    tmp = ls[1].split(',')
    d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
    result.append(d)
surface = [d['surface'] for d in result]
c = Counter(surface)
print(c.most_common())
# 36. Bar chart of the 10 most frequent words
from collections import Counter
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'AppleGothic'  # a font that can display Japanese labels

path = 'neko.txt.mecab'
with open(path) as f:
    text = f.read().split('\n')
result = []
for line in text[:-1]:
    if line == 'EOS':
        continue
    ls = line.split('\t')
    tmp = ls[1].split(',')
    d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
    result.append(d)
surface = [d['surface'] for d in result]
c = Counter(surface)
target = list(zip(*c.most_common(10)))  # -> [words, counts]
plt.bar(*target)
plt.show()
# 37. Top 10 words co-occurring (within the same sentence) with '猫' (cat)
from collections import Counter
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'AppleGothic'

path = 'neko.txt.mecab'
with open(path) as f:
    text = f.read().split('\n')
result = []
tmp_cooccurrence = []
cooccurrence = []
inCat = False  # does the current sentence contain '猫'?
for line in text[:-1]:
    if line == 'EOS':  # sentence boundary
        if inCat:
            cooccurrence.extend(tmp_cooccurrence)
        tmp_cooccurrence = []
        inCat = False
        continue
    ls = line.split('\t')
    tmp = ls[1].split(',')
    d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
    result.append(d)
    if ls[0] != '猫':
        tmp_cooccurrence.append(ls[0])
    else:
        inCat = True
c = Counter(cooccurrence)
target = list(zip(*c.most_common(10)))
plt.bar(*target)
plt.show()
# 38. Histogram of word frequencies (frequencies 1 to 10)
from collections import Counter
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'AppleGothic'

path = 'neko.txt.mecab'
with open(path) as f:
    text = f.read().split('\n')
result = []
for line in text[:-1]:
    if line == 'EOS':
        continue
    ls = line.split('\t')
    tmp = ls[1].split(',')
    d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
    result.append(d)
surface = [d['surface'] for d in result]
c = Counter(surface)
plt.hist(list(c.values()), range=(1, 10))
plt.show()
# 39. Zipf's law: log-log plot of frequency rank vs. frequency
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['font.family'] = 'AppleGothic'

path = 'neko.txt.mecab'
with open(path) as f:
    text = f.read().split('\n')
result = []
for line in text[:-1]:
    if line == 'EOS':
        continue
    ls = line.split('\t')
    tmp = ls[1].split(',')
    d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
    result.append(d)
surface = [d['surface'] for d in result]
c = Counter(surface)
v = [kv[1] for kv in c.most_common()]
plt.scatter(np.log(np.arange(1, len(v) + 1)), np.log(v))  # ranks start at 1 to avoid log(0)
plt.show()