Recently I needed to summarize speech that had been converted to text and take it in as data. As part of that investigation, I ran the automatic sentence summarization API published by Recruit Technologies under Python 3. It was originally written for Python 2.7, and since I imagine many people would like to run it on Python 3, I am sharing the changes here. GitHub of Recruit Technologies
This time I converted only the lexrank algorithm, and I only verified access through the web interface. The original code comments have been removed.
| Item | Version | Notes |
|:--|:--|:--|
| OS | Windows 10 Home | |
| Python | WinPython64-3.7.7.1.exe | |
| MeCab | MeCab 0.996 64bit version | https://github.com/ikegami-yukino/mecab/releases/tag/v0.996 |
| Python additional modules | CherryPy==18.6.0, mecab==0.996.2 | |
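Before going further, it is worth checking that the additional modules and the MeCab binding import cleanly under Python 3. The following is only a minimal sketch of such a check; the sample sentence is arbitrary.

# Environment check (a sketch): confirm CherryPy and the MeCab binding are importable
# and that MeCab can tokenize a Japanese sentence.
import cherrypy
import MeCab

print('CherryPy', cherrypy.__version__)
print(MeCab.Tagger().parse('今日はいい天気ですね。'))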
summpy-master
├─server.bat
└─summpy
   ├─lexrank.py
   ├─server.py
   ├─tools.py
   ├─misc
   │  ├─divrank.py
   │  └─mecab_segmenter.py
   └─server_data
      └─test.html
server.py
#!/usr/bin/env python
# coding: utf-8
import sys
import os
import re
import getopt
import cherrypy
import json
path = os.getcwd() #Win compatible
sys.path.append(path) #Win compatible
from summpy import tools #Win compatible
class Summarizer(object):
def __init__(self):
self.summarizers = {}
def get_summarizer(self, name):
if name in self.summarizers:
pass
elif name == 'lexrank':
from summpy import lexrank
self.summarizers[name] = lexrank.summarize
elif name == 'mcp':
from summpy import mcp_summ
self.summarizers[name] = mcp_summ.summarize
return self.summarizers[name]
@cherrypy.expose
def summarize(self, text=None, algo='lexrank', **summarizer_params):
try: # TODO: generate more useful error message
# fix parameter type
for param, value in list(summarizer_params.items()):
if value == '':
del summarizer_params[param]
continue
                elif re.match(r'^\d*\.\d+$', value):  # escape the dot so only real decimals become floats
value = float(value)
elif re.match(r'^\d+$', value):
value = int(value)
elif value == 'true':
value = True
elif value == 'false':
value = False
summarizer_params[param] = value
if algo in ('lexrank', 'clexrank', 'divrank'):
summarizer = self.get_summarizer('lexrank')
if algo == 'clexrank':
summarizer_params['continuous'] = True
if algo == 'divrank':
summarizer_params['use_divrank'] = True
elif algo == 'mcp':
summarizer = self.get_summarizer('mcp')
            summary, debug_info = summarizer(text, **summarizer_params)  # **summarizer_params expands the remaining request parameters as keyword arguments
except Exception as e:
print(str(e))
return json.dumps({'error': str(e)}, ensure_ascii=False, indent=2)
else:
res = json.dumps(
tools.tree_encode({
'summary': summary, 'debug_info': debug_info
}),
ensure_ascii=False, indent=2
)
            return res.encode('utf8')  # CherryPy page handlers must return bytes: https://stackoverflow.com/questions/20215147/python-cherrypy-500-valueerror-page-handlers-must-return-bytes
if __name__ == '__main__':
options, args = getopt.getopt(sys.argv[1:], 'h:p:')
options = dict(options)
host, port = options['-h'], int(options['-p'])
cherrypy.config.update({
'server.socket_host': host,
'server.socket_port': port
})
conf = {
'/': {
'tools.staticdir.root': path
},
'/summarize': {
'tools.response_headers.on': True,
'tools.response_headers.headers': [
('Content-type', 'application/json')
]
},
'/static': {
'tools.staticdir.on': True,
'tools.staticdir.dir': 'summpy\\server_data', #Win compatible
'tools.response_headers.headers': [
('Content-type', 'application/json')
]
}
}
cherrypy.quickstart(Summarizer(), '/', conf)
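Once the server is started (via server.bat, described at the end), the /summarize endpoint can also be called from Python instead of the browser. The following is only a minimal client sketch, assuming the server is listening on 127.0.0.1:8000; the sample text is arbitrary.

# Client sketch for the /summarize endpoint (assumes the server runs on 127.0.0.1:8000).
import json
import urllib.parse
import urllib.request

params = {
    'text': '今日はいい天気ですね。明日は雨が降るそうです。週末は晴れるといいのですが。',
    'algo': 'lexrank',
    'sent_limit': 2,
}
data = urllib.parse.urlencode(params).encode('utf-8')
with urllib.request.urlopen('http://127.0.0.1:8000/summarize', data=data) as res:
    result = json.loads(res.read().decode('utf-8'))
print(result['summary'])      # list of summary sentences
print(result['debug_info'])   # empty unless debug=true is also sent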
lexrank.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys,os
import getopt
import codecs
import collections
import numpy
import networkx
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import pairwise_distances
path = os.getcwd() #Win compatible
sys.path.append(path) #Win compatible
from summpy import tools #Win compatible
from summpy.misc.divrank import divrank, divrank_scipy #Win compatible
def lexrank(sentences, continuous=False, sim_threshold=0.1, alpha=0.9,
use_divrank=False, divrank_alpha=0.25):
# configure ranker
ranker_params = {'max_iter': 1000}
if use_divrank:
ranker = divrank_scipy
ranker_params['alpha'] = divrank_alpha
ranker_params['d'] = alpha
else:
ranker = networkx.pagerank_scipy
ranker_params['alpha'] = alpha
graph = networkx.DiGraph()
# sentence -> tf
sent_tf_list = []
for sent in sentences:
words = tools.word_segmenter_ja(sent)
tf = collections.Counter(words)
sent_tf_list.append(tf)
sent_vectorizer = DictVectorizer(sparse=True)
sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)
    # compute similarities between sentences
sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric='cosine')
if continuous:
linked_rows, linked_cols = numpy.where(sim_mat > 0)
else:
linked_rows, linked_cols = numpy.where(sim_mat >= sim_threshold)
# create similarity graph
graph.add_nodes_from(list(range(sent_vecs.shape[0])))
for i, j in zip(linked_rows, linked_cols):
if i == j:
continue
weight = sim_mat[i,j] if continuous else 1.0
        # graph.add_edge(i, j, {'weight': weight})  # networkx 1.x style (dict argument)
        graph.add_edge(i, j, weight=weight)  # networkx 2.x: pass the edge weight as a keyword argument
scores = ranker(graph, **ranker_params)
return scores, sim_mat
def summarize(text, sent_limit=None, char_limit=None, imp_require=None,
debug=False, **lexrank_params):
debug_info = {}
sentences = list(tools.sent_splitter_ja(text))
scores, sim_mat = lexrank(sentences, **lexrank_params)
sum_scores = sum(scores.values())
acc_scores = 0.0
indexes = set()
num_sent, num_char = 0, 0
for i in sorted(scores, key=lambda i: scores[i], reverse=True):
num_sent += 1
num_char += len(sentences[i])
if sent_limit is not None and num_sent > sent_limit:
break
if char_limit is not None and num_char > char_limit:
break
if imp_require is not None and acc_scores / sum_scores >= imp_require:
break
indexes.add(i)
acc_scores += scores[i]
if len(indexes) > 0:
summary_sents = [sentences[i] for i in sorted(indexes)]
else:
summary_sents = sentences
if debug:
debug_info.update({
'sentences': sentences, 'scores': scores
})
return summary_sents, debug_info
if __name__ == '__main__':
_usage = '''
Usage:
python lexrank.py -f <file_name> [-e <encoding> ]
[ -v lexrank | clexrank | divrank ]
[ -s <sent_limit> | -c <char_limit> | -i <imp_required> ]
Args:
-f: plain text file to be summarized
-e: input and output encoding (default: utf-8)
-v: variant of LexRank (default is 'lexrank')
-s: summary length (the number of sentences)
  -c: summary length (the number of characters)
-i: cumulative LexRank score [0.0-1.0]
'''.strip()
options, args = getopt.getopt(sys.argv[1:], 'f:e:v:s:c:i:')
options = dict(options)
if len(options) < 2:
print(_usage)
sys.exit(0)
fname = options['-f']
encoding = options['-e'] if '-e' in options else 'utf-8'
variant = options['-v'] if '-v' in options else 'lexrank'
sent_limit = int(options['-s']) if '-s' in options else None
char_limit = int(options['-c']) if '-c' in options else None
imp_require = float(options['-i']) if '-i' in options else None
if fname == 'stdin':
        text = '\n'.join(sys.stdin.readlines())  # Python 3: stdin already yields str, no decode needed
else:
text = codecs.open(fname, encoding=encoding).read()
lexrank_params = {}
if variant == 'clexrank':
lexrank_params['continuous'] = True
if variant == 'divrank':
lexrank_params['use_divrank'] = True
sentences, debug_info = summarize(
text, sent_limit=sent_limit, char_limit=char_limit,
imp_require=imp_require, **lexrank_params
)
for sent in sentences:
        print(sent.strip())  # Python 3: print the str directly instead of encoded bytes
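summarize() can also be called directly from Python 3 without going through the CLI or the server. The following is a minimal sketch, assuming it is run from the summpy-master directory so that the summpy package is importable; the text and limits are arbitrary.

# Direct use of the LexRank summarizer (a sketch; run from the summpy-master directory).
from summpy.lexrank import summarize

text = '今日はいい天気ですね。明日は雨が降るそうです。週末は晴れるといいのですが。'
sentences, debug_info = summarize(text, sent_limit=1, debug=True)
for s in sentences:
    print(s)
print(debug_info['scores'])  # LexRank score for each sentence index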
tools.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os,sys
import re
import json
path = os.getcwd() #Win compatible
sys.path.append(path) #Win compatible
def tree_encode(obj, encoding='utf-8'):
type_ = type(obj)
if type_ == list or type_ == tuple:
return [tree_encode(e, encoding) for e in obj]
elif type_ == dict:
new_obj = dict(
(tree_encode(k, encoding), tree_encode(v, encoding))
for k, v in obj.items()
)
return new_obj
    elif type_ == str:  # Python 3: unicode has been merged into str, so no encoding is needed
        # return obj.encode(encoding)  # no longer necessary
        return obj
else:
return obj
def sent_splitter_ja(text, delimiters=set('。.?!\n\r'),
parenthesis='()「」『』“”'):
paren_chars = set(parenthesis)
close2open = dict(list(zip(parenthesis[1::2], parenthesis[0::2])))
pstack = []
buff = []
for i, c in enumerate(text):
c_next = text[i+1] if i+1 < len(text) else None
# check correspondence of parenthesis
if c in paren_chars:
if c in close2open: # close
if len(pstack) > 0 and pstack[-1] == close2open[c]:
pstack.pop()
else: # open
pstack.append(c)
buff.append(c)
if c in delimiters:
if len(pstack) == 0 and c_next not in delimiters:
yield ''.join(buff)
buff = []
if len(buff) > 0:
yield ''.join(buff)
if os.environ.get('SUMMPY_USE_JANOME') is not None:
from summpy.misc.janome_segmenter import word_segmenter_ja
else:
try:
from summpy.misc.mecab_segmenter import word_segmenter_ja
except ImportError:
from summpy.misc.janome_segmenter import word_segmenter_ja
if __name__ == '__main__':
pass
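The sentence splitter is worth a quick look on its own: it tracks parentheses, so a delimiter inside 「」 does not end the sentence. A small sketch of that behaviour follows; the sample text is arbitrary.

# sent_splitter_ja keeps delimiters inside parentheses from ending a sentence (a sketch).
from summpy.tools import sent_splitter_ja

text = '彼は「今日は帰る。明日また来る」と言った。それで解散した。'
for s in sent_splitter_ja(text):
    print(s)
# The quoted 「…。…」 stays inside the first sentence;
# 'それで解散した。' comes out as the second sentence.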
divrank.py (unchanged)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import networkx as nx
from networkx.exception import NetworkXError
from networkx.utils import not_implemented_for
@not_implemented_for('multigraph')
def divrank(G, alpha=0.25, d=0.85, personalization=None,
max_iter=100, tol=1.0e-6, nstart=None, weight='weight',
dangling=None):
'''
Returns the DivRank (Diverse Rank) of the nodes in the graph.
This code is based on networkx.pagerank.
Args: (diff from pagerank)
alpha: controls strength of self-link [0.0-1.0]
d: the damping factor
Reference:
Qiaozhu Mei and Jian Guo and Dragomir Radev,
DivRank: the Interplay of Prestige and Diversity in Information Networks,
http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.174.7982
'''
if len(G) == 0:
return {}
if not G.is_directed():
D = G.to_directed()
else:
D = G
# Create a copy in (right) stochastic form
W = nx.stochastic_graph(D, weight=weight)
N = W.number_of_nodes()
# self-link (DivRank)
for n in W.nodes_iter():
for n_ in W.nodes_iter():
if n != n_ :
if n_ in W[n]:
W[n][n_][weight] *= alpha
else:
if n_ not in W[n]:
W.add_edge(n, n_)
W[n][n_][weight] = 1.0 - alpha
# Choose fixed starting vector if not given
if nstart is None:
x = dict.fromkeys(W, 1.0 / N)
else:
# Normalized nstart vector
s = float(sum(nstart.values()))
x = dict((k, v / s) for k, v in list(nstart.items()))
if personalization is None:
# Assign uniform personalization vector if not given
p = dict.fromkeys(W, 1.0 / N)
else:
missing = set(G) - set(personalization)
if missing:
raise NetworkXError('Personalization dictionary '
'must have a value for every node. '
'Missing nodes %s' % missing)
s = float(sum(personalization.values()))
p = dict((k, v / s) for k, v in list(personalization.items()))
if dangling is None:
# Use personalization vector if dangling vector not specified
dangling_weights = p
else:
missing = set(G) - set(dangling)
if missing:
raise NetworkXError('Dangling node dictionary '
'must have a value for every node. '
'Missing nodes %s' % missing)
s = float(sum(dangling.values()))
dangling_weights = dict((k, v/s) for k, v in list(dangling.items()))
dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0]
# power iteration: make up to max_iter iterations
for _ in range(max_iter):
xlast = x
x = dict.fromkeys(list(xlast.keys()), 0)
danglesum = d * sum(xlast[n] for n in dangling_nodes)
for n in x:
D_t = sum(W[n][nbr][weight] * xlast[nbr] for nbr in W[n])
for nbr in W[n]:
#x[nbr] += d * xlast[n] * W[n][nbr][weight]
x[nbr] += (
d * (W[n][nbr][weight] * xlast[nbr] / D_t) * xlast[n]
)
x[n] += danglesum * dangling_weights[n] + (1.0 - d) * p[n]
# check convergence, l1 norm
err = sum([abs(x[n] - xlast[n]) for n in x])
if err < N*tol:
return x
raise NetworkXError('divrank: power iteration failed to converge '
'in %d iterations.' % max_iter)
def divrank_scipy(G, alpha=0.25, d=0.85, personalization=None,
max_iter=100, tol=1.0e-6, nstart=None, weight='weight',
dangling=None):
'''
Returns the DivRank (Diverse Rank) of the nodes in the graph.
This code is based on networkx.pagerank_scipy
'''
import scipy.sparse
N = len(G)
if N == 0:
return {}
nodelist = G.nodes()
M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight,
dtype=float)
S = scipy.array(M.sum(axis=1)).flatten()
S[S != 0] = 1.0 / S[S != 0]
Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format='csr')
M = Q * M
# self-link (DivRank)
M = scipy.sparse.lil_matrix(M)
M.setdiag(0.0)
M = alpha * M
M.setdiag(1.0 - alpha)
#print M.sum(axis=1)
# initial vector
x = scipy.repeat(1.0 / N, N)
# Personalization vector
if personalization is None:
p = scipy.repeat(1.0 / N, N)
else:
missing = set(nodelist) - set(personalization)
if missing:
raise NetworkXError('Personalization vector dictionary '
'must have a value for every node. '
'Missing nodes %s' % missing)
p = scipy.array([personalization[n] for n in nodelist],
dtype=float)
p = p / p.sum()
# Dangling nodes
if dangling is None:
dangling_weights = p
else:
missing = set(nodelist) - set(dangling)
if missing:
raise NetworkXError('Dangling node dictionary '
'must have a value for every node. '
'Missing nodes %s' % missing)
# Convert the dangling dictionary into an array in nodelist order
dangling_weights = scipy.array([dangling[n] for n in nodelist],
dtype=float)
dangling_weights /= dangling_weights.sum()
is_dangling = scipy.where(S == 0)[0]
# power iteration: make up to max_iter iterations
for _ in range(max_iter):
xlast = x
D_t = M * x
x = (
d * (x / D_t * M * x + sum(x[is_dangling]) * dangling_weights)
+ (1.0 - d) * p
)
# check convergence, l1 norm
err = scipy.absolute(x - xlast).sum()
if err < N * tol:
return dict(list(zip(nodelist, list(map(float, x)))))
raise NetworkXError('divrank_scipy: power iteration failed to converge '
'in %d iterations.' % max_iter)
if __name__ == '__main__':
g = nx.Graph()
# this network appears in the reference.
edges = {
1: [2, 3, 6, 7, 8, 9],
2: [1, 3, 10, 11, 12],
3: [1, 2, 15, 16, 17],
4: [11, 13, 14],
5: [17, 18, 19, 20],
6: [1],
7: [1],
8: [1],
9: [1],
10: [2],
11: [4],
12: [2],
13: [4],
14: [4],
15: [3],
16: [3],
17: [3, 5],
18: [5],
19: [5],
20: [5]
}
for u, vs in edges.items():
for v in vs:
g.add_edge(u, v)
scores = nx.pagerank(g)
print('# PageRank')
print('# rank: node score')
#print sum(scores.values())
for i, n in enumerate(sorted(scores, key=lambda n: scores[n], reverse=True)):
print('# {}: {} {}'.format(i+1, n, scores[n]))
scores = divrank(g)
print('\n# DivRank')
#print sum(scores.values())
print('# rank: node score')
for i, n in enumerate(sorted(scores, key=lambda n: scores[n], reverse=True)):
print('# {}: {} {}'.format(i+1, n, scores[n]))
scores = divrank_scipy(g)
print('\n# DivRank (scipy)')
#print sum(scores.values())
print('# rank: node score')
for i, n in enumerate(sorted(scores, key=lambda n: scores[n], reverse=True)):
print('# {}: {} {}'.format(i+1, n, scores[n]))
mecab_segmenter.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import MeCab
_mecab = MeCab.Tagger()
# MeCab feature fields: part of speech, POS subcategories 1-3, conjugation type, conjugation form, base form, reading, pronunciation
_mecab_feat_labels = 'pos cat1 cat2 cat3 conj conj_t orig read pron'.split(' ')
def _mecab_parse_feat(feat):
return dict(list(zip(_mecab_feat_labels, feat.split(','))))
def _mecab_node2seq(node, decode_surface=True, feat_dict=True,
mecab_encoding='utf-8'):
# MeCab.Node cannot change attribute.
while node:
if decode_surface:
            # node._surface = node.surface.decode(mecab_encoding)  # decode not needed in Python 3
            node._surface = node.surface
if feat_dict: #Save part of speech information with dict
node.feat_dict = _mecab_parse_feat(
                # node.feature.decode(mecab_encoding)  # decode not needed in Python 3
node.feature
)
yield node
node = node.next
def is_stopword(n): # <- mecab node
if len(n._surface) == 0:
return True
    elif re.search(r'^[\s!-@\[-`\{-~　、-〜！-＠［-｀]+$', n._surface):  # ur'' prefixes changed to r'' here and in the regexes below
        return True
    elif re.search(r'^(接尾|非自立)', n.feat_dict['cat1']):  # suffixes and non-independent words
        return True
    elif 'サ変・スル' == n.feat_dict['conj'] or 'ある' == n.feat_dict['orig']:  # suru-verb conjugation type, or the verb ある
        return True
    elif re.search(r'^(名詞|動詞|形容詞)', n.feat_dict['pos']):  # keep nouns, verbs, adjectives
return False
else:
return True
def not_stopword(n): # <- mecab node
return not is_stopword(n)
def node2word(n): # <- mecab node
return n._surface
def node2norm_word(n): # mecab node
if n.feat_dict['orig'] != '*':
return n.feat_dict['orig']
else:
return n._surface
def word_segmenter_ja(sent, node_filter=not_stopword,
node2word=node2norm_word, mecab_encoding='utf-8'):
#if type(sent) == str:Delete
# sent = sent.encode(mecab_encoding)Delete
nodes = list(
_mecab_node2seq(_mecab.parseToNode(sent))
)
if node_filter:
nodes = [n for n in nodes if node_filter(n)]
words = [node2word(n) for n in nodes]
return words
if __name__ == '__main__':
    text = '今日はいい天気ですね。'  # "It's nice weather today, isn't it."
    print('|'.join(word_segmenter_ja(text)))  # .encode('utf-8') removed for Python 3
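By default word_segmenter_ja drops stopwords and normalizes each remaining word to its base form via node2norm_word. The sketch below compares that with the raw surface forms; the exact tokens depend on the MeCab dictionary in use, so treat it as illustrative only.

# Compare normalized words with raw surface forms (a sketch; output depends on the MeCab dictionary).
from summpy.misc.mecab_segmenter import word_segmenter_ja, node2word

text = '昨日は走りました。'
print(word_segmenter_ja(text))                       # base forms, stopwords removed
print(word_segmenter_ja(text, node2word=node2word))  # surface forms, stopwords removed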
test.html (basically unchanged; I enlarged the textarea and added an ajax error handler.)
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
<textarea type="text" name="text" rows="20" cols="70"></textarea>
<br>
algorithm (lexrank|clexrank|divrank|mcp): <input type="text" value="lexrank" name="algo" /><br>
length (the number of sentences): <input type="text" value="3" name="sent_limit" /><br>
length (the number of chars): <input type="text" value="" name="char_limit" /><br>
cumulative LexRank score: <input type="text" value="" name="imp_require" /><br>
<button id="summarize">summarize</button>
<br>
<div id="out"></div>
<script src="http://code.jquery.com/jquery-2.0.3.min.js"></script>
<script type="text/javascript">
$(document).ready(function () {
$("#summarize").click(function (e) {
var text = $("textarea[name='text']").val();
var params = {
text: text,
algo: $("input[name='algo']").val(),
sent_limit: $("input[name='sent_limit']").val(),
char_limit: $("input[name='char_limit']").val(),
imp_require: $("input[name='imp_require']").val(),
debug: true
};
$.post("/summarize", params)
.done(function (res) {
var sentences = res.summary;
var debug_info = res.debug_info;
var out = $("#out");
var summ_length = 0;
out.empty();
sentences.forEach(function (s) {
summ_length += s.length;
out.append("<p>" + s + "</p>");
});
var summ_rate = summ_length / text.length;
out.prepend(
'<p style="color:blue">'
+ 'Summary rate: ' + summ_rate
                        + ' (' + summ_length + '/' + text.length + ' chars)'
+ '</p>'
);
}).fail((jqXHR, textStatus, errorThrown) => {
alert("error" + jqXHR + "/" + textStatus + "/" + errorThrown)
})
});
});
</script>
</body>
</html>
Other
【server.bat】
python -m summpy.server -h 127.0.0.1 -p 8000
[URL when accessing from a browser]
http://127.0.0.1:8000/static/test.html
I should probably just put this up on GitHub... Anyway, this was my first post in a while.