JavaPG I made a PG that GROPUP the log message stored in db and further morphologically analyze it with Mecab and normalize it.
Tiggercmd.java
import java.io.File;
import java.io.FileInputStream;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.Properties;
import oracle.jdbc.*;
import net.moraleboost.mecab.Lattice;
import net.moraleboost.mecab.impl.StandardTagger;
import net.moraleboost.mecab.Node;
class Tiggercmd {
static String[] hinsis = {
"type0","type1","type2","type3","style0","style1","calls0","calls1","calls2"
};
public static void main(String[] args) throws Exception { //Build Tagger.
if (args.length<1) {
System.err.println(
"USAGE:Tiggercmd readwrite-propfileName(non extents)"
);
return;
}
StandardTagger tagger = new StandardTagger("");
//Get version string
System.err.println("MeCab version " + tagger.version());
//Build Lattice (an object that stores runtime information required for morphological analysis)
Lattice lattice = tagger.createLattice();
//Open INFO property
Properties Tiggercmd_props = new Properties();
Tiggercmd_props.load(new FileInputStream(new File(args[0])));
OracleConnection con1 = null;
OracleConnection con2 = null;
PreparedStatement in_sql_prepare = null;
ResultSet in_rls = null;
OraclePreparedStatement con2prpares = null;
int msgid = 0;
int seq =0;
String features[] = null;
try {
//Connect to IN DB
con1 = connect(Tiggercmd_props.getProperty("conids1.user"), Tiggercmd_props.getProperty("conids1.password"),Tiggercmd_props.getProperty("conids1.TNSPINGString"));
//Connect to WRITE DB
con2 = connect(Tiggercmd_props.getProperty("conids2.user"), Tiggercmd_props.getProperty("conids2.password"),Tiggercmd_props.getProperty("conids2.TNSPINGString"));
//Execute readQuery
in_sql_prepare = con1.prepareStatement(Tiggercmd_props.getProperty("in.sqls"));
in_rls = in_sql_prepare.executeQuery();
System.err.println("READSQL:"+in_sql_prepare.toString());
while (in_rls.next() ){
//Set the analysis target character string
String in_surface = in_rls.getString(Tiggercmd_props.getProperty("in.surface_colname"));//String text = "Its a sunny day.";
System.out.println(msgid+":\t"+in_surface);
lattice.setSentence(in_surface);
tagger.parse(lattice);
//Output morphological analysis results
System.out.println(lattice.toString());
//Output surface shape and features while tracing morphemes one by one
Node node = lattice.bosNode();
while (node != null) {
String addssql = Tiggercmd_props.getProperty("wr.sqls");
//Set to parse variables with OraclePreparedStatement
con2prpares = (OraclePreparedStatement)con2.prepareStatement(addssql);
con2prpares.setIntAtName("msgid",msgid);
con2prpares.setStringAtName("msg",in_surface);
con2prpares.setIntAtName("seq",seq);
con2prpares.setStringAtName("surface",node.surface());
con2prpares.setStringAtName("feature",node.feature());
features = node.feature().split(",");
for (int i=0;i<hinsis.length;i++){
con2prpares.setStringAtName(hinsis[i],(i<features.length)?features[i]:null);
}
//Update and update count output
System.err.println("inserts:\t"+con2prpares.executeUpdate());
con2prpares.close();con2prpares=null;
node = node.next();
seq++;
}
con2.commit();
seq = 0;
lattice.clear();
msgid ++;
}
in_rls.close();in_rls=null;
in_sql_prepare.close();in_sql_prepare=null;
con1.close();con1=null;
con2.close();con2=null;
}catch (Exception oe){
oe.printStackTrace(System.err);
}finally {
try {
if (in_rls!=null) in_rls.close();
if (in_sql_prepare!=null) in_sql_prepare.close();
if (con1 != null) con1.close();
if (con2prpares!=null) con2prpares.close();
if (con2 != null) con2.close();
}catch (Exception fe){}
}
// lattice,Destroy tagger
lattice.destroy();
tagger.destroy();
}
/**
* Utility method: creates a new JDBC connection to the database.
*/
static OracleConnection connect(String user,String password,String tnspingString) throws Exception {
OracleConnection conn = null;
OracleDriver dr = new OracleDriver();
Properties prop = new Properties();
prop.setProperty("user",user);
prop.setProperty("password",password);
conn = (OracleConnection)dr.connect("jdbc:oracle:thin:@"+tnspingString,prop);
conn.setAutoCommit(false);
return conn;
}
}
properties
domainlogconrwinfo.properties
#INFO
#Query target connection information
conids1.user=hoge
conids1.password=hoge
conids1.TNSPINGString=(DESCRIPTION = (ADDRESS_LIST = (ADDRESS = (PROTOCOL = TCP)(HOST = 192.168.0.XXX)(PORT = 1521))) (CONNECT_DATA = (SERVER = DEDICATED) (SERVICE_NAME = servicename1)))
#↑ The full connection information output with the tnspings connection name is described.
#INSERT target connection information
conids2.user=hoge
conids2.password=hoge
conids2.TNSPINGString=(DESCRIPTION = (ADDRESS_LIST = (ADDRESS = (PROTOCOL = TCP)(HOST = 192.168.0.***)(PORT = 1521))) (CONNECT_DATA = (SERVER = DEDICATED) (SERVICE_NAME = servicenamea)))
#↑ The full connection information output with the tnspings connection name is described.
in.sqls= \
select a.MESSAGETEXT,count(a.TIMEPARTS) from domainlog a \
where 1=1 \
group by a.MESSAGETEXT \
order by a.MESSAGETEXT
#↑ SQL to get the list of messages you want to analyze
in.surface_colname=MESSAGETEXT
#↑ Column name assigned to the message in that SQL
#↓ INSERT SQL
wr.sqls= \
insert into domainlog_rexs (msgid,MESSAGETEXT,seq,surface,manals \
,type0,type1,type2,type3,style0,style1,calls0,calls1,calls2 ) \
values(:msgid,:msg,:seq,:surface,:feature, \
:type0,:type1,:type2,:type3,:style0,:style1,:calls0,:calls1,:calls2 \
)
#↑ If the following morphological analysis data variables are described in VALUES according to the column, the following variables will be set there.
#:msgid parse message serial number
#:msg Analysis message string
#:seq Analysis morpheme serial number
#:surface Analysis morpheme surface string
#:feature Feature (part of speech?) System analysis result character string
#:type0 IPA Type1
#:type1 IPA Type2
#:type2 IPA Type3
#:type3 IPA Type4
#:style0 type 1
#:style1 type 2
#:calls0 reading 1
#:calls1 reading 2
#:calls2 reading 3
#Reference link https://qiita.com/priancho/items/f633a9e99616b3fe451f
Use LISTAGG (character SUM catalog function) for character summary
python
--MSGID,List the combined CALLS1 in the analysis results for each MSGTEXT
select a.MESSAGETEXT,a.MSGID,a.CONCATHINSIWORD,a.ITEMS from (
select a.MSGID,a.MESSAGETEXT,b.ITEMS
,listagg(a.CALLS1,';') within group (order by a.MSGID,a.SEQ) as concatHINSIWORD
from domainlog_rexs a
,(select a.MSGID,listagg(a.SURFACE,';') within group (order by a.SEQ) as items
from domainlog_rexs a
where 1=1
and a.SURFACE is not null
and a.TYPE0='noun'
and a.TYPE1 in (
'General'
,'number'
,'Change connection'
)
group by a.MSGID) b
where 1=1
and ( a.CALLS0 <> '*' and a.TYPE0 is not null)
and a.MSGID=b.MSGID
group by a.MSGID,a.MESSAGETEXT,b.ITEMS
) a
order by a.MSGID
result: The 10,000-stroke log message has been reduced to thousands by grouping simple messages. Furthermore, the grouping of the morphological analysis results in terms of wording reduced the number to less than 100 putters. As a result, what is the number of subjects that people pay attention to?
Concern: Useful wisdom created by these researchers (in this example, the morphological analysis engine Mecab) Take in and make it practical without being swept away by fashion Are there Japanese vendors or famous software / hardware development / manufacturing / sales companies? Is it because changing the existing interest structure is the most dangerous behavior for us? What will happen by the Tokyo Olympics if this cannot be broken? I am worried as a Japanese engineer.
Recommended Posts