[PYTHON] Use with CaboCha to automatically generate IOB2-tagged corpus training data

https://gist.github.com/jpena930/0753edfd27e010503755ccfdaeb965bf

#coding: utf-8
from __future__ import print_function  # Only needed for Python 2
import MeCab
import CaboCha
import sys
import os


# Shared CaboCha parser used to analyse the input text.
# NOTE(review): per CaboCha's command-line help, "-f1" selects the lattice
# output format and "-n1" enables named-entity output -- confirm against the
# installed CaboCha version.
cabocha = CaboCha.Parser("-f1 -n1")
# MeCab tagger created with the "-Ochasen" output-format option.
# NOTE(review): `m` is not referenced anywhere else in the visible file.
m = MeCab.Tagger ("-Ochasen")

# For reading from file
class getWords():
    """Helper for loading the raw text that is going to be parsed."""

    def readText(self, filename):
        """Return the entire contents of ``filename`` decoded as UTF-8.

        Args:
            filename: Path of the text file to read.

        Returns:
            The file contents as a single string.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # The ``with`` statement closes the file on exit, so no explicit
        # close() call is needed.
        with open(filename, 'r', encoding='utf-8') as f:
            return f.read()

# Usage: python training_generator <text file>
#
# Reads the UTF-8 text file given on the command line, parses it with CaboCha
# and writes the resulting token lines (with shortened named-entity tags) to
# "<text file>_generated.txt".

# Long named-entity labels and the three-letter forms written to the corpus.
# Every label is rewritten for both its "B-" and its "I-" prefixed variant.
NE_TAG_ABBREVIATIONS = (
    ("ORGANIZATION", "ORG"),
    ("ARTIFACT", "ART"),
    ("LOCATION", "LOC"),
    ("DATE", "DAT"),
    ("TIME", "TIM"),
    ("PERSON", "PSN"),
    ("MONEY", "MNY"),
    ("PERCENT", "PNT"),
)


def shorten_ne_tags(text):
    """Return ``text`` with long named-entity tags replaced by short ones.

    For example ``B-ORGANIZATION`` becomes ``B-ORG`` and ``I-PERSON`` becomes
    ``I-PSN``. The replacement is a plain substring substitution over the
    whole string.

    Args:
        text: Parser output (or any string) containing named-entity tags.

    Returns:
        The string with every tag listed in ``NE_TAG_ABBREVIATIONS`` shortened.
    """
    for long_name, short_name in NE_TAG_ABBREVIATIONS:
        for prefix in ("B-", "I-"):
            text = text.replace(prefix + long_name, prefix + short_name)
    return text


def build_corpus_lines(cabocha_output):
    """Convert raw parser output into the lines of the generated corpus.

    Steps:
      1. Shorten the named-entity tags.
      2. Replace every comma with a tab so each feature is its own column.
      3. Drop lines starting with ``*`` and insert an empty line after every
         line starting with the full stop ``。`` (sentence separator).
      4. Remove the trailing ``EOS`` marker and any trailing blank lines.

    Step 4 replaces an earlier "drop the last two lines" rule. The result is
    the same when the text ends with ``。``, but the final token is no longer
    lost when it does not.

    Args:
        cabocha_output: The string returned by the parser.

    Returns:
        A list of output lines without line terminators.
    """
    converted = shorten_ne_tags(cabocha_output).replace(",", "\t")

    corpus_lines = []
    for line in converted.splitlines():
        if not line.startswith("*"):
            corpus_lines.append(line)
        if line.startswith("。"):
            # Blank line marks the end of a sentence in the corpus.
            corpus_lines.append("")

    # Strip only the end-of-output marker and the blank separator(s) that may
    # precede it; real token lines are always kept.
    if corpus_lines and corpus_lines[-1] == "EOS":
        corpus_lines.pop()
    while corpus_lines and corpus_lines[-1] == "":
        corpus_lines.pop()
    return corpus_lines


def generate_corpus(input_path):
    """Parse ``input_path`` and write ``<input_path>_generated.txt``.

    The input is read once, as UTF-8, through ``getWords.readText``. The
    output file is overwritten if it already exists and is written in a
    single pass, also as UTF-8.

    Args:
        input_path: Path of the UTF-8 text file to convert.

    Returns:
        The path of the generated file.
    """
    text = getWords().readText(input_path)
    corpus_lines = build_corpus_lines(cabocha.parseToString(text))

    output_path = input_path + '_generated.txt'
    # Mode "w" truncates any previous result, so no explicit removal is
    # needed; the explicit encoding keeps the output independent of the
    # platform's default locale.
    with open(output_path, 'w', encoding='utf-8') as out:
        out.writelines(line + "\n" for line in corpus_lines)
    return output_path


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Raises:
        SystemExit: With a usage message when no input file is given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("Usage: python training_generator <text file>")
    generate_corpus(argv[1])


if __name__ == "__main__":
    main()

Next step: adjust the generated tags to suit your needs.

References: http://qiita.com/Hironsan/items/326b66711eb4196aa9d4 and https://github.com/Hironsan/IOB2Corpus