Die 2020er-Ausgabe der „100 Übungen zur Sprachverarbeitung“ (NLP 100 Knock) wurde veröffentlicht.
In diesem Artikel veröffentliche ich die Lösungen in kompakter Form, damit man sie leichter nachschlagen kann. Die Erklärungen stehen jeweils in separaten Artikeln.
Sprachverarbeitung 100 Knock 2020 [Kapitel 5: Antwort auf Abhängigkeitsanalyse]
# Print the string reversed.
s = 'stressed'
print(''.join(reversed(s)))
# Print every second character, starting with the first one.
s = 'Patatoku Kashii'
print(s[0::2])
# Interleave the characters of two strings (zip stops at the shorter one).
s1 = 'Pat Auto'
s2 = 'Taxi'
print(''.join(a + b for a, b in zip(s1, s2)))
# Word lengths of the sentence after removing commas and periods.
s = "Now I need a drink, alcoholic of course, after the heavy lectures involving quantum mechanics."
s = s.replace(',', '').replace('.', '')
lengths = [len(w) for w in s.split()]
# A bare expression is only echoed in a notebook; print it so that the
# result is also visible when the snippet runs as a script.
print(lengths)
# Map each element symbol to the 1-based position of the word it comes from.
s = "Hi He Lied Because Boron Could Not Oxidize Fluorine. New Nations Might Also Sign Peace Security Clause. Arthur King Can."
s = s.replace('.', '')
# 1-based word positions that contribute only their first letter;
# every other word contributes its first two letters.
idx = [1, 5, 6, 7, 8, 9, 15, 16, 19]
mp = {}
for i, w in enumerate(s.split(), start=1):
    if i in idx:
        v = w[:1]
    else:
        v = w[:2]
    mp[v] = i
print(mp)
def ngram(S, n):
    """Return all contiguous length-n slices of the sequence S, in order.

    S may be any sliceable sequence: a string yields character n-grams
    (as strings), a list of words yields word n-grams (as lists).
    The result is empty when S is shorter than n.
    """
    return [S[i:i + n] for i in range(len(S) - n + 1)]


s = 'I am an NLPer'
# Word bi-grams.
print(ngram(s.split(), 2))
# Character bi-grams.
print(ngram(s, 2))
def ngram(S, n):
    """Return all contiguous length-n slices of the sequence S, in order.

    The result is empty when S is shorter than n.
    """
    return [S[i:i + n] for i in range(len(S) - n + 1)]


s1 = 'paraparaparadise'
s2 = 'paragraph'
# Character bi-gram sets of the two words.
st1 = set(ngram(s1, 2))
st2 = set(ngram(s2, 2))
print(st1 | st2)  # union
print(st1 & st2)  # intersection
print(st1 - st2)  # difference
print('se' in st1)
print('se' in st2)
def temperature(x, y, z):
    """Fill x, y and z into the fixed sentence template and return it.

    Each argument is converted with str(); the template's literal parts
    are concatenated without any extra spacing.
    """
    return '{}von Zeit{}Ist{}'.format(str(x), str(y), str(z))


x = 12
y = 'Temperatur'
z = 22.4
print(temperature(x, y, z))
def cipher(S):
    """Mirror every ASCII lowercase letter (a<->z, b<->y, ...) in S.

    Characters outside 'a'..'z' are copied unchanged. The mapping is its
    own inverse, so applying cipher twice returns the original string.
    """
    new = []
    for s in S:
        # 219 == ord('a') + ord('z'), so 219 - code mirrors within a..z.
        if 97 <= ord(s) <= 122:
            s = chr(219 - ord(s))
        new.append(s)
    return ''.join(new)


s = 'I am an NLPer'
new = cipher(s)
print(new)
print(cipher(new))
import random
# Typoglycemia: for every word longer than four characters keep the first
# and last character in place and shuffle the characters in between.
s = 'I couldn’t believe that I could actually understand what I was reading : the phenomenal power of the human mind .'
ans = []
text = s.split()
for word in text:
    if len(word) > 4:
        mid = list(word[1:-1])
        random.shuffle(mid)
        word = word[0] + ''.join(mid) + word[-1]
    # Short words are appended unchanged.
    ans.append(word)
print(' '.join(ans))
https://kakedashi-engineer.appspot.com/2020/04/16/nlp100-10-14/ https://kakedashi-engineer.appspot.com/2020/04/17/nlp100-15-19/
import pandas as pd
# Load the tab-separated table (no header row) and report its row count.
df = pd.read_csv('popular-names.txt', sep='\t', header=None)
print(df.shape[0])
wc popular-names.txt
import pandas as pd
# Re-write the tab-separated table with single spaces as the separator.
df = pd.read_csv('popular-names.txt', sep='\t', header=None)
df.to_csv(
    'space.txt',
    sep=' ',
    index=False,
    header=False,
)
sed 's/\t/ /g' popular-names.txt > replaced.txt
import pandas as pd
# Save the first and the second column to separate files.
df = pd.read_csv('popular-names.txt', sep='\t', header=None)
for position, target in enumerate(('col1.txt', 'col2.txt')):
    df.iloc[:, position].to_csv(target, sep=' ', header=False, index=False)
cut -f 1 popular-names.txt > col1.txt
cut -f 2 popular-names.txt > col2.txt
import pandas as pd
# Put the two single-column files side by side and save them tab-separated.
df1 = pd.read_csv('col1.txt', sep='\t', header=None)
df2 = pd.read_csv('col2.txt', sep='\t', header=None)
df = pd.concat((df1, df2), axis='columns')
df.to_csv('col1_2.txt', sep='\t', index=False, header=False)
paste col1.txt col2.txt > col1_2.txt
import pandas as pd
# Show the first five rows of the table.
df = pd.read_csv('popular-names.txt', sep='\t', header=None)
print(df.iloc[:5])
head -n 5 popular-names.txt
import pandas as pd
# Show the last five rows of the table.
df = pd.read_csv('popular-names.txt', sep='\t', header=None)
print(df.iloc[-5:])
tail -n 5 popular-names.txt
import pandas as pd

# Number of output files.
N = 3
df = pd.read_csv('popular-names.txt', delimiter='\t', header=None)
# Ceiling division, so that N chunks of `step` rows cover every row.
step = -(-len(df) // N)
for n in range(N):
    # The last chunk may be shorter; iloc clips the slice at the end.
    df_split = df.iloc[n * step:(n + 1) * step]
    df_split.to_csv('popular-names' + str(n) + '.txt', sep='\t', header=False, index=False)
split -n l/3 popular-names.txt
import pandas as pd
# Distinct values of the first column, in order of first appearance.
df = pd.read_csv('popular-names.txt', sep='\t', header=None)
first_column = df[0]
new = first_column.unique()
print(new)
cut -f 1 popular-names.txt | sort | uniq
import pandas as pd
# Values of the third column, largest first.
df = pd.read_csv('popular-names.txt', sep='\t', header=None)
third_column = df[2]
new = third_column.sort_values(ascending=False)
print(new)
cut -f 3 popular-names.txt | sort -n -r
import pandas as pd
# Frequency table of the first column.
df = pd.read_csv('popular-names.txt', sep='\t', header=None)
vc = df[0].value_counts().to_frame().reset_index()
vc.columns = ['name', 'count']
# Most frequent first; ties are ordered by name, descending.
vc = vc.sort_values(by=['count', 'name'], ascending=[False, False])
print(vc)
cut -f 1 popular-names.txt | sort | uniq -c | sort -n -r
https://kakedashi-engineer.appspot.com/2020/04/18/nlp100-20-24/ https://kakedashi-engineer.appspot.com/2020/04/19/nlp100-25-26/ https://kakedashi-engineer.appspot.com/2020/04/20/nlp100-27-28/ https://kakedashi-engineer.appspot.com/2020/04/21/nlp100-29-30/
import pandas as pd
# One JSON object per line, each with a 'title' and a 'text' field.
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
# The dump is taken from the Japanese Wikipedia, so titles are Japanese;
# the United Kingdom article is titled 'イギリス' (a translated title such
# as 'England' matches no row).
uk = wiki[wiki['title'] == 'イギリス'].text.values
print(uk)
import pandas as pd
import re
# Print every line of the UK article that mentions a category.
pattern = re.compile('Category')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
# Titles in the Japanese Wikipedia dump are Japanese ('イギリス' = UK).
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
for line in ls:
    if pattern.search(line):
        print(line)
import pandas as pd
import re
# Print only the category names of the UK article.
# Capture the name inside [[Category:NAME]] or [[Category:NAME|sortkey]];
# the optional sort key after '|' is discarded.
pattern = re.compile(r'\[\[Category:(.+?)(?:\|.*?)?\]\]')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
# Titles in the Japanese Wikipedia dump are Japanese ('イギリス' = UK).
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
for line in ls:
    m = pattern.search(line)
    if m:
        print(m.group(1))
import pandas as pd
import re
# Print every section heading of the UK article together with its level.
# A heading line starts with one or more '=' and ends with one or more '='.
pattern = re.compile(r'^=+.*=+$')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
# Titles in the Japanese Wikipedia dump are Japanese ('イギリス' = UK).
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
for line in ls:
    if pattern.search(line):
        # '==X==' has four '=' signs and is level 1, '===X===' is level 2, ...
        level = line.count('=') // 2 - 1
        print(line.replace('=', ''), level)
import pandas as pd
import re
# Print the names of all media files referenced in the UK article.
# The prefix alternatives must be grouped: without the group the pattern
# means "File" OR "<prefix>:name|", and a bare "File" match yields ''.
# 'ファイル' is the Japanese namespace name used in the jawiki dump.
pattern = re.compile(r'(?:File|ファイル):(.+?)\|')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
# Titles in the Japanese Wikipedia dump are Japanese ('イギリス' = UK).
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
for line in ls:
    # A line may reference more than one file; report each of them.
    for name in pattern.findall(line):
        print(name)
import pandas as pd
import re
# Collect the "|field = value" lines of the UK article into a dict.
pattern = re.compile(r'\|(.+?)\s=\s*(.+)')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
# Titles in the Japanese Wikipedia dump are Japanese ('イギリス' = UK).
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
d = {}
for line in ls:
    m = pattern.search(line)
    if m:
        # group(1) is the field name, group(2) the raw value.
        d[m.group(1)] = m.group(2)
print(d)
import pandas as pd
import re
# Collect "|field = value" lines into a dict, with emphasis markup removed
# from the values.
pattern = re.compile(r'\|(.+?)\s=\s*(.+)')
# Two or more apostrophes around a text mark emphasis (''x'', '''x''', ...).
p_emp = re.compile(r"'{2,}(.+?)'{2,}")
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
# Titles in the Japanese Wikipedia dump are Japanese ('イギリス' = UK).
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
d = {}
for line in ls:
    m = pattern.search(line)
    if m:
        # Keep only the emphasized text itself, without the quote runs.
        d[m.group(1)] = p_emp.sub(r'\1', m.group(2))
print(d)
import pandas as pd
import re
# Collect "|field = value" lines into a dict after removing emphasis and
# internal-link brackets from the article text.
pattern = re.compile(r'\|(.+?)\s=\s*(.+)')
p_emp = re.compile(r"'{2,}(.+?)'{2,}")
p_link = re.compile(r'\[\[(.+?)\]\]')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
# Titles in the Japanese Wikipedia dump are Japanese ('イギリス' = UK).
uk = wiki[wiki['title'] == 'イギリス'].text.values
lines = uk[0]
# Clean the whole text once, then split it into lines.
lines = p_emp.sub(r'\1', lines)
lines = p_link.sub(r'\1', lines)
ls = lines.split('\n')
d = {}
for line in ls:
    m = pattern.search(line)
    if m:
        d[m.group(1)] = m.group(2)
print(d)
import pandas as pd
import re
# Collect "|field = value" lines into a dict after removing emphasis,
# internal-link brackets, <ref> elements and <br> tags from the text.
pattern = re.compile(r'\|(.+?)\s=\s*(.+)')
p_emp = re.compile(r"'{2,}(.+?)'{2,}")
p_link = re.compile(r'\[\[(.+?)\]\]')
# '[br|ref]' would be a character class (one of b, r, |, e, f), not an
# alternation, so spell out the three cases explicitly: a self-closing
# <ref ... />, a <ref ...>...</ref> pair on one line, and a <br> tag.
p_refbr = re.compile(r'<ref\b[^>]*?/>|<ref\b[^>]*?>.*?</ref>|<br\s*/?>')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
# Titles in the Japanese Wikipedia dump are Japanese ('イギリス' = UK).
uk = wiki[wiki['title'] == 'イギリス'].text.values
lines = uk[0]
lines = p_emp.sub(r'\1', lines)
lines = p_link.sub(r'\1', lines)
lines = p_refbr.sub('', lines)
ls = lines.split('\n')
d = {}
for line in ls:
    m = pattern.search(line)
    if m:
        d[m.group(1)] = m.group(2)
print(d)
import pandas as pd
import re
import requests
# Look up the URL of the UK flag image through the MediaWiki API.
pattern = re.compile(r'\|(.+?)\s=\s*(.+)')
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
# Titles in the Japanese Wikipedia dump are Japanese ('イギリス' = UK).
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')
d = {}
for line in ls:
    m = pattern.search(line)
    if m:
        d[m.group(1)] = m.group(2)

S = requests.Session()
URL = "https://commons.wikimedia.org/w/api.php"
PARAMS = {
    "action": "query",
    "format": "json",
    # Field names in the jawiki infobox are Japanese; '国旗画像' holds the
    # file name of the flag image.
    "titles": "File:" + d['国旗画像'],
    "prop": "imageinfo",
    # The URL is only part of the response when requested explicitly.
    "iiprop": "url",
}
R = S.get(url=URL, params=PARAMS, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
R.raise_for_status()
DATA = R.json()
PAGES = DATA['query']['pages']
for v in PAGES.values():
    print(v['imageinfo'][0]['url'])
https://kakedashi-engineer.appspot.com/2020/04/22/nlp100-31-34/ https://kakedashi-engineer.appspot.com/2020/04/22/nlp100-35-39/
import MeCab
# Read MeCab output ("surface<TAB>comma-separated features" per token,
# 'EOS' after each sentence) into a list of dicts, one per token.
path = 'neko.txt.mecab'
result = []
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        # Skip sentence terminators and blank lines (e.g. a trailing empty
        # line), which have no feature column to split.
        if not line or line == 'EOS':
            continue
        ls = line.split('\t')
        tmp = ls[1].split(',')
        d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
        result.append(d)
import MeCab
# Surface forms of all verbs in the MeCab output.
path = 'neko.txt.mecab'
result = []
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        # Skip sentence terminators and blank lines.
        if not line or line == 'EOS':
            continue
        ls = line.split('\t')
        tmp = ls[1].split(',')
        d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
        result.append(d)
# MeCab's part-of-speech tags are Japanese; '動詞' is the tag for verbs
# (a translated tag such as 'Verb' never matches).
verbs = [d['surface'] for d in result if d['pos'] == '動詞']
print(verbs)
import MeCab
# Base (dictionary) forms of all verbs in the MeCab output.
path = 'neko.txt.mecab'
result = []
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        # Skip sentence terminators and blank lines.
        if not line or line == 'EOS':
            continue
        ls = line.split('\t')
        tmp = ls[1].split(',')
        d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
        result.append(d)
# MeCab's part-of-speech tags are Japanese; '動詞' is the tag for verbs.
verb_bases = [d['base'] for d in result if d['pos'] == '動詞']
print(verb_bases)
import MeCab
# Noun phrases of the form "A の B": two nouns joined by the particle 'の'.
path = 'neko.txt.mecab'
result = []
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        # Skip sentence terminators and blank lines.
        if not line or line == 'EOS':
            continue
        ls = line.split('\t')
        tmp = ls[1].split(',')
        d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
        result.append(d)
noun_phrase = []
# Slide a three-token window over the token list. The tags and the
# particle are Japanese in MeCab output: '名詞' = noun, 'の' = "of".
for first, middle, last in zip(result, result[1:], result[2:]):
    if first['pos'] == '名詞' and middle['surface'] == 'の' and last['pos'] == '名詞':
        noun_phrase.append(first['surface'] + middle['surface'] + last['surface'])
print(noun_phrase)
import MeCab
# Concatenate every maximal run of consecutive nouns into one string.
path = 'neko.txt.mecab'
result = []
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        # Skip sentence terminators and blank lines.
        if not line or line == 'EOS':
            continue
        ls = line.split('\t')
        tmp = ls[1].split(',')
        d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
        result.append(d)
ls_noun = []
noun = ''
for d in result:
    # MeCab's part-of-speech tags are Japanese; '名詞' is the tag for nouns.
    if d['pos'] == '名詞':
        noun += d['surface']
    elif noun != '':
        # A non-noun ends the current run.
        ls_noun.append(noun)
        noun = ''
# Flush a run that reaches the end of the token list.
if noun != '':
    ls_noun.append(noun)
    noun = ''
print(ls_noun)
import MeCab
from collections import Counter
# Count surface forms and print them from most to least frequent.
path = 'neko.txt.mecab'
result = []
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        # Skip sentence terminators and blank lines.
        if not line or line == 'EOS':
            continue
        ls = line.split('\t')
        tmp = ls[1].split(',')
        d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
        result.append(d)
surface = [d['surface'] for d in result]
c = Counter(surface)
print(c.most_common())
import MeCab
from collections import Counter
import matplotlib.pyplot as plt
# Bar chart of the ten most frequent surface forms.
plt.rcParams['font.family'] = 'AppleGothic'
path = 'neko.txt.mecab'
result = []
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        # Skip sentence terminators and blank lines.
        if not line or line == 'EOS':
            continue
        ls = line.split('\t')
        tmp = ls[1].split(',')
        d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
        result.append(d)
surface = [d['surface'] for d in result]
c = Counter(surface)
# Transpose [(word, count), ...] into [(words...), (counts...)].
target = list(zip(*c.most_common(10)))
# NOTE(review): the plotting calls were missing from the snippet; a bar
# chart of words against counts is the evident use of `target`.
plt.bar(*target)
plt.show()
import MeCab
from collections import Counter
import matplotlib.pyplot as plt
# Bar chart of the ten words that most often share a sentence with '猫'.
plt.rcParams['font.family'] = 'AppleGothic'
path = 'neko.txt.mecab'
result = []
tmp_cooccurrence = []  # words of the sentence currently being read
cooccurrence = []      # words from all sentences that contained '猫'
inCat = False          # True once '猫' was seen in the current sentence
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        if not line:
            continue
        if line == 'EOS':
            # End of sentence: keep its words only if '猫' occurred in it.
            if inCat:
                cooccurrence.extend(tmp_cooccurrence)
            tmp_cooccurrence = []
            inCat = False
            continue
        ls = line.split('\t')
        tmp = ls[1].split(',')
        d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
        result.append(d)
        # The text is Japanese, so the target word is '猫' ("cat").
        if ls[0] != '猫':
            tmp_cooccurrence.append(ls[0])
        else:
            inCat = True
c = Counter(cooccurrence)
# Transpose [(word, count), ...] into [(words...), (counts...)].
target = list(zip(*c.most_common(10)))
# NOTE(review): the plotting calls were missing from the snippet; a bar
# chart of words against counts is the evident use of `target`.
plt.bar(*target)
plt.show()
import MeCab
from collections import Counter
import matplotlib.pyplot as plt
# Histogram of word frequencies (how many words occur 1..10 times).
plt.rcParams['font.family'] = 'AppleGothic'
path = 'neko.txt.mecab'
result = []
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        # Skip sentence terminators and blank lines.
        if not line or line == 'EOS':
            continue
        ls = line.split('\t')
        tmp = ls[1].split(',')
        d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
        result.append(d)
surface = [d['surface'] for d in result]
c = Counter(surface)
# Pass a real list: a dict view is not a sequence matplotlib can rely on.
plt.hist(list(c.values()), range=(1, 10))
plt.show()
import MeCab
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
# Log-log plot of word frequency against frequency rank.
plt.rcParams['font.family'] = 'AppleGothic'
path = 'neko.txt.mecab'
result = []
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        # Skip sentence terminators and blank lines.
        if not line or line == 'EOS':
            continue
        ls = line.split('\t')
        tmp = ls[1].split(',')
        d = {'surface': ls[0], 'base': tmp[6], 'pos': tmp[0], 'pos1': tmp[1]}
        result.append(d)
surface = [d['surface'] for d in result]
c = Counter(surface)
# Frequencies in descending order; position i holds the count of rank i+1.
v = [kv[1] for kv in c.most_common()]
# NOTE(review): the plotting calls were missing from the snippet. Ranks
# start at 1 so that the logarithm is defined for the first point.
plt.scatter(np.log(np.arange(1, len(v) + 1)), np.log(v))
plt.show()
Sprachverarbeitung 100 Knock 2020 [Antworten 00–49]
Recommended Posts