# From "Python in Practice: Create Better Programs Using Concurrency, Libraries, and Patterns"
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# http://www.amazon.co.jp/gp/product/B00EO3TRL2
# Python in Practice: Create Better Programs Using Concurrency, Libraries, and Patterns (Developer's Library)
import abc
import re

try:
    import HTMLParser
except ImportError:  # Python 3 ships the same parser as html.parser
    import html.parser as HTMLParser
class AbstractWordCounter(abc.ABCMeta("_WordCounterBase", (object,), {})):
    """Interface for classes that count the words in one kind of file.

    Both operations are static methods, so implementations can be used
    directly as classes without being instantiated.

    The ``abc.ABCMeta`` metaclass is inherited from a throwaway base class
    created by calling the metaclass directly.  A ``__metaclass__`` class
    attribute is ignored by Python 3, whereas an inherited metaclass takes
    effect on both Python 2 and Python 3.
    """

    @staticmethod
    @abc.abstractmethod
    def can_count(filename):
        """Return a true value if this counter can handle ``filename``."""

    @staticmethod
    @abc.abstractmethod
    def count(filename):
        """Return the number of words found in ``filename``."""
class PlainTextWordCounter(AbstractWordCounter):
    """Word counter for plain-text files (``.py`` and ``.txt``)."""

    @staticmethod
    def can_count(filename):
        """Tell whether ``filename`` ends in a supported extension.

        The extension comparison is case-insensitive.
        """
        supported = (".py", ".txt")
        return filename.lower().endswith(supported)

    @staticmethod
    def count(filename):
        """Return the number of word-character runs in the file.

        A file whose extension is not supported yields 0 and is never
        opened.
        """
        if PlainTextWordCounter.can_count(filename):
            word = re.compile(r"\w+")
            with open(filename) as stream:
                # One pass over the file, adding up the matches per line.
                return sum(len(word.findall(row)) for row in stream)
        return 0
class HtmlWordCounter(AbstractWordCounter):
    """Word counter for HTML files (``.html`` and ``.htm``)."""

    @staticmethod
    def can_count(filename):
        """Tell whether ``filename`` ends in a supported extension.

        The extension comparison is case-insensitive.
        """
        supported = (".html", ".htm")
        return filename.lower().endswith(supported)

    @staticmethod
    def count(filename):
        """Return the word total reported by ``MyHTMLParser`` for the file.

        A file whose extension is not supported yields 0 and is never
        opened.
        """
        if HtmlWordCounter.can_count(filename):
            html_parser = MyHTMLParser()
            with open(filename) as stream:
                markup = stream.read()
            # The whole document is handed to the parser in a single call.
            html_parser.feed(markup)
            return html_parser.count
        return 0
class MyHTMLParser(HTMLParser.HTMLParser):
    """HTML parser that tallies the words in a document's text.

    Character data inside ``<script>`` and ``<style>`` elements is ignored.
    Pass markup to ``feed()`` and read the running total from ``count``.

    Attributes:
        regex: Compiled pattern matching one run of word characters.
        inText: False while inside a ``<script>`` or ``<style>`` element.
        text: Always an empty list; kept only so that code reading the
            attribute keeps working now that nothing is buffered.
        count: Number of words seen so far.
    """

    _SKIPPED_TAGS = frozenset(("script", "style"))

    def __init__(self):
        # Explicit base-class call: it does not depend on the base class
        # supporting super().
        HTMLParser.HTMLParser.__init__(self)
        self.regex = re.compile(r"\w+")
        self.inText = True
        self.text = []
        self.count = 0

    def handle_starttag(self, tag, attrs):
        """Stop counting when a script or style element opens."""
        if tag in self._SKIPPED_TAGS:
            self.inText = False

    def handle_endtag(self, tag):
        """Resume counting when a script or style element closes."""
        if tag in self._SKIPPED_TAGS:
            self.inText = True

    def handle_data(self, text):
        """Add the words in one chunk of character data to ``count``.

        Words are counted as soon as the data arrives instead of being
        buffered until the next end tag.  Text that is never followed by a
        closing tag (trailing text, or markup with unclosed elements) is
        therefore included in the total rather than silently dropped.
        Separate chunks are still counted independently, exactly as when
        they were joined with a space before matching.
        """
        if self.inText:
            self.count += len(self.regex.findall(text))
def count_word(filename):
    """Count the words in ``filename`` with the first counter that accepts it.

    ``PlainTextWordCounter`` is consulted before ``HtmlWordCounter``.
    Returns that counter's result, or ``None`` when neither counter
    accepts the file name.
    """
    candidates = (PlainTextWordCounter, HtmlWordCounter)
    chosen = next(
        (counter for counter in candidates if counter.can_count(filename)),
        None,
    )
    if chosen is None:
        return None
    return chosen.count(filename)
# Demo: count the words in one sample file of each supported kind.
# print is written with parentheses around a single argument: Python 2
# prints exactly the same text as the statement form, and the line is
# also valid syntax on Python 3, where print is a function.
c = count_word("/tmp/sample.txt")
print("c=" + str(c) + "\n")
h = count_word("/tmp/sample.html")
print("h=" + str(h) + "\n")
Recommended Posts