package cc.mallet.share.weili.ner.enron;

import cc.mallet.fst.CRF;
import cc.mallet.fst.CRFTrainerByLabelLikelihood;
import cc.mallet.fst.MultiSegmentationEvaluator;
import cc.mallet.fst.ViterbiWriter;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.PrintTokenSequenceFeatures;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.TokenSequence2FeatureVectorSequence;
import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.pipe.tsf.LexiconMembership;
import cc.mallet.pipe.tsf.OffsetConjunctions;
import cc.mallet.pipe.tsf.RegexMatches;
import cc.mallet.pipe.tsf.TrieLexiconMembership;
import cc.mallet.share.upenn.ner.NEPipes;
import cc.mallet.types.Alphabet;
import cc.mallet.types.InstanceList;
import java.io.File;
import java.io.IOException;
import java.util.Random;
import java.util.regex.Pattern;
import org.springframework.transaction.interceptor.RuleBasedTransactionAttribute;

/* loaded from: input_file:cc/mallet/share/weili/ner/enron/TUI.class */
/**
 * Command-line driver that builds a token-feature pipeline from a set of
 * lexicon files, trains a {@link CRF} on the files found under
 * {@link #DATA_DIR}, evaluates it on an 80/20 train/test split and finally
 * serializes the trained model.
 *
 * <p>Usage:
 * {@code TUI <FeatureInduction|NoFeatureInduction> <crf-output-file> [<viterbi-output-prefix>]}
 *
 * <p>All resource locations are hard-coded absolute paths.
 */
public class TUI {
    private static final String CAPS = "[\\p{Lu}]";
    private static final String LOW = "[\\p{Ll}]";
    private static final String CAPSNUM = "[\\p{Lu}\\p{Nd}]";
    private static final String ALPHA = "[\\p{Lu}\\p{Ll}]";
    private static final String ALPHANUM = "[\\p{Lu}\\p{Ll}\\p{Nd}]";
    private static final String PUNT = "[,\\.;:?!()]";
    private static final String QUOTE = "[\"`']";

    /** Directory holding the CoNLL, googlesets, person-name and miscellaneous lexicons. */
    private static final String CONLL_DICT_DIR = "/usr/col/tmp1/weili/Resource/conllDict/";
    /** Directory holding the {@code IDF_*} lexicons. */
    private static final String IDF_DICT_DIR = "/usr/col/tmp1/weili/Resource/idfDict/";
    /** File handed to {@link NEPipes}. */
    private static final String PLACES_FILE = "/usr/col/tmp1/weili/Resource/places";
    /** Root directory that is scanned for training/testing messages. */
    private static final String DATA_DIR = "/usr/can/tmp3/weili/NER/Enron/data";

    private static final String MODE_FEATURE_INDUCTION = "FeatureInduction";
    private static final String MODE_NO_FEATURE_INDUCTION = "NoFeatureInduction";
    private static final String USAGE =
            "Usage: TUI <FeatureInduction|NoFeatureInduction> <crf-output-file> [<viterbi-output-prefix>]";

    /**
     * Entry point.
     *
     * @param strArr {@code [0]} training mode, {@code [1]} file the trained CRF
     *               is written to, {@code [2]} (optional) file-name prefix for
     *               the Viterbi output written every fifth iteration
     * @throws IOException           if a lexicon or data file cannot be read or
     *                               the model cannot be written
     * @throws IllegalStateException if the mode is {@code FeatureInduction},
     *                               which is not supported
     */
    public static void main(String[] strArr) throws IOException {
        // Validate the command line before doing any expensive loading or
        // training, so a bad invocation fails immediately instead of with an
        // ArrayIndexOutOfBoundsException after the work has been done.
        if (strArr.length < 1) {
            System.err.println(USAGE);
            System.exit(1);
            return;
        }
        String mode = strArr[0];
        if (mode.equals(MODE_FEATURE_INDUCTION)) {
            throw new IllegalStateException("Feature induction not yet supported.");
        }
        if (!mode.equals(MODE_NO_FEATURE_INDUCTION)) {
            System.err.println("Feature induction or not? Give me a choice.");
            System.exit(1);
            return;
        }
        if (strArr.length < 2) {
            System.err.println(USAGE);
            System.exit(1);
            return;
        }
        File modelFile = new File(strArr[1]);
        String viterbiPrefix = strArr.length > 2 ? strArr[2] : null;

        // Conjoin each token's features with those of its left and right
        // neighbour (offsets -1 and +1).
        int[][] offsets = {{-1}, {1}};

        SerialPipes pipeline = new SerialPipes(new Pipe[]{
            new EnronMessage2TokenSequence(),
            new NEPipes(new File(PLACES_FILE)),
            buildConllLexiconsPipe(),
            buildGoogleLexiconsPipe(),
            buildFixedLexiconsPipe(),
            buildIdfLexiconsPipe(),
            new OffsetConjunctions(offsets),
            new PrintTokenSequenceFeatures(),
            new TokenSequence2FeatureVectorSequence(true, true)
        });

        InstanceList allData = new InstanceList(pipeline);
        allData.addThruPipe(new FileIterator(DATA_DIR, FileIterator.STARTING_DIRECTORIES));
        // Fixed seed keeps the 80/20 split reproducible between runs.
        InstanceList[] split = allData.split(new Random(1L), new double[]{0.8d, 0.2d});
        InstanceList trainingData = split[0];
        InstanceList testingData = split[1];

        Alphabet targets = pipeline.getTargetAlphabet();
        StringBuilder labels = new StringBuilder("State labels:");
        for (int i = 0; i < targets.size(); i++) {
            labels.append(' ').append(targets.lookupObject(i));
        }
        System.out.println(labels);
        System.out.println("Number of features = " + pipeline.getDataAlphabet().size());

        CRF crf = new CRF(pipeline, (Pipe) null);
        crf.addStatesForThreeQuarterLabelsConnectedAsIn(trainingData);
        CRFTrainerByLabelLikelihood trainer = new CRFTrainerByLabelLikelihood(crf);
        trainer.setGaussianPriorVariance(100.0d);

        // Only the "O" state keeps a finite initial weight.
        for (int i = 0; i < crf.numStates(); i++) {
            crf.getState(i).setInitialWeight(Double.NEGATIVE_INFINITY);
        }
        crf.getState("O").setInitialWeight(0.0d);

        System.out.println("Training on " + trainingData.size() + " training instances.");
        MultiSegmentationEvaluator evaluator = new MultiSegmentationEvaluator(
                new InstanceList[]{trainingData, testingData},
                new String[]{"train", "test"},
                new String[]{"B-DATE", "B-TIME", "B-LOCATION", "B-PERSON", "B-ORGANIZATION",
                    "B-ACRONYM", "B-PHONE", "B-MONEY", "B-PERCENT"},
                new String[]{"I-DATE", "I-TIME", "I-LOCATION", "I-PERSON", "I-ORGANIZATION",
                    "I-ACRONYM", "I-PHONE", "I-MONEY", "I-PERCENT"});

        trainer.train(trainingData, 5, new double[]{0.1d, 0.2d, 0.5d, 0.7d});
        // trainIncremental's boolean result is used as the stop condition.
        while (!trainer.trainIncremental(trainingData)) {
            evaluator.evaluate(trainer);
            if (viterbiPrefix != null && trainer.getIteration() % 5 == 0) {
                // The original only constructed the writer, which produces no
                // output; it has to be asked to evaluate the trainer.
                new ViterbiWriter(viterbiPrefix, trainingData, "train", testingData, "test")
                        .evaluate(trainer);
            }
        }

        crf.write(modelFile);
    }

    /** Builds the pipe of the four lexicons under {@code conll/}. */
    private static SerialPipes buildConllLexiconsPipe() throws IOException {
        return new SerialPipes(trieLexicons("conll/",
                "CONLLTWOPER", "CONLLTWOLOC", "CONLLTWOORG", "CONLLTWOMISC"));
    }

    /** Builds the pipe of the lexicons under {@code googlesets/}. */
    private static SerialPipes buildGoogleLexiconsPipe() throws IOException {
        return new SerialPipes(trieLexicons("googlesets/",
                "ORGSOCCER", "ORGGOVT", "ORGNGO", "ORGMILITARY", "ORGCOMPANY", "ORGBANK",
                "ORGTRADE", "ORGNEWS", "ORGOPERATINGSYSTEM", "ORGPOLITICALPARTY", "ORGTRAVEL",
                "ORGBASEBALLTEAMAUGF", "ORGCARMODEL", "ORGCARCOMPANY", "ORGENGLISHCOUNTYAUG",
                "ORGUNIVERSITY", "MISCNATIONALITYAUGF", "MISCDISEASEAUG", "MISCTIME",
                "MISCAWARDS", "MISCMOVIESAUGF", "MISCPOLITICALPARTY", "MISCRELIGION",
                "MISCGOVT", "MISCWAR", "MISCCURRENCY", "LOC", "PERFL", "MISCF",
                "ORGFRAWEDITEDSORTED"));
    }

    /** Builds the pipe of explicitly named person-name, calendar, place and company lexicons. */
    private static SerialPipes buildFixedLexiconsPipe() throws IOException {
        return new SerialPipes(new Pipe[]{
            lexicon("FIRSTHIGHEST", "personname/ssdi.prfirsthighest"),
            lexicon("FIRSTHIGH", "personname/ssdi.prfirsthigh"),
            lexicon("FIRSTMED", "personname/ssdi.prfirstmed"),
            lexicon("FIRSTLOW", "personname/ssdi.prfirstlow"),
            lexicon("LASTHIGHEST", "personname/ssdi.prlasthighest"),
            lexicon("LASTHIGH", "personname/ssdi.prlasthigh"),
            lexicon("LASTMED", "personname/ssdi.prlastmed"),
            lexicon("LASTLOW", "personname/ssdi.prlastlow"),
            lexicon("HONORIFIC", "personname/honorifics"),
            lexicon("NAMESUFFIX", "personname/namesuffixes"),
            lexicon("NAMEPARTICLE", "personname/name-particles"),
            lexicon("DAY", "days"),
            lexicon("MONTH", "months"),
            lexicon("PLACESUFFIX", "place-suffixes"),
            namedTrie("COUNTRY", CONLL_DICT_DIR, "countries"),
            namedTrie("COUNTRYCAPITAL", CONLL_DICT_DIR, "country-capitals"),
            namedTrie("USSTATE", CONLL_DICT_DIR, "US-states"),
            namedTrie("COMPANYNAME", CONLL_DICT_DIR, "company-names"),
            namedTrie("COMPANYSUFFIX", CONLL_DICT_DIR, "company-suffixes"),
            namedTrie("CONTINENT", CONLL_DICT_DIR, "continents"),
            lexicon("STOPWORD", "stopwords"),
            new TrieLexiconMembership(new File(CONLL_DICT_DIR + "biz.yahoo/COMPANYNAME.ABBREV")),
            new TrieLexiconMembership(new File(CONLL_DICT_DIR + "utexas/UNIVERSITIES"))
        });
    }

    /** Builds the pipe of the {@code IDF_*} lexicons stored in {@link #IDF_DICT_DIR}. */
    private static SerialPipes buildIdfLexiconsPipe() throws IOException {
        return new SerialPipes(new Pipe[]{
            namedTrie("IDF_DES", IDF_DICT_DIR, "designator.data"),
            namedTrie("IDF_FIR", IDF_DICT_DIR, "firstnames.data"),
            namedTrie("IDF_LOC", IDF_DICT_DIR, "locations.data"),
            namedTrie("IDF_NAT", IDF_DICT_DIR, "nations.data"),
            namedTrie("IDF_ABB", IDF_DICT_DIR, "non-final-abbrevs.data"),
            namedTrie("IDF_ORG", IDF_DICT_DIR, "organization.data"),
            namedTrie("IDF_PER", IDF_DICT_DIR, "person.data")
        });
    }

    /**
     * Builds the regex-based spelling feature pipe.
     *
     * <p>NOTE(review): the original code constructed this pipe and discarded
     * it; it is not part of the training pipeline. It is kept here, unwired,
     * so that behaviour is unchanged. Add it to the pipeline in
     * {@link #main(String[])} if these features are wanted.
     */
    private static SerialPipes buildSpellingFeaturesPipe() {
        return new SerialPipes(new Pipe[]{
            new RegexMatches("INITCAP", Pattern.compile(CAPS + ".*")),
            new RegexMatches("CAPITALIZED", Pattern.compile(CAPS + LOW + "*")),
            new RegexMatches("ALLCAPS", Pattern.compile(CAPS + "+")),
            new RegexMatches("MIXEDCAPS", Pattern.compile("[A-Z][a-z]+[A-Z][A-Za-z]*")),
            new RegexMatches("CONTAINSDIGITS", Pattern.compile(".*[0-9].*")),
            new RegexMatches("ALLDIGITS", Pattern.compile("[0-9]+")),
            new RegexMatches("NUMERICAL", Pattern.compile("[-0-9]+[\\.,]+[0-9\\.,]+")),
            new RegexMatches("MULTIDOTS", Pattern.compile("\\.\\.+")),
            new RegexMatches("ENDSINDOT", Pattern.compile("[^\\.]+.*\\.")),
            new RegexMatches("CONTAINSDASH", Pattern.compile(ALPHANUM + "+-" + ALPHANUM + "*")),
            new RegexMatches("ACRO", Pattern.compile("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")),
            new RegexMatches("LONELYINITIAL", Pattern.compile(CAPS + "\\.")),
            new RegexMatches("SINGLECHAR", Pattern.compile(ALPHA)),
            new RegexMatches("CAPLETTER", Pattern.compile("[A-Z]")),
            new RegexMatches("PUNC", Pattern.compile(PUNT)),
            new RegexMatches("QUOTE", Pattern.compile(QUOTE))
        });
    }

    /**
     * Creates one unnamed {@link TrieLexiconMembership} per file name, each
     * resolved against {@code CONLL_DICT_DIR + subDir}, in the given order.
     */
    private static Pipe[] trieLexicons(String subDir, String... fileNames) throws IOException {
        Pipe[] pipes = new Pipe[fileNames.length];
        for (int i = 0; i < fileNames.length; i++) {
            pipes[i] = new TrieLexiconMembership(new File(CONLL_DICT_DIR + subDir + fileNames[i]));
        }
        return pipes;
    }

    /** Creates a named {@link TrieLexiconMembership} for {@code dir + fileName}, passing {@code true} as the flag. */
    private static Pipe namedTrie(String featureName, String dir, String fileName) throws IOException {
        return new TrieLexiconMembership(featureName, new File(dir + fileName), true);
    }

    /** Creates a named {@link LexiconMembership} for a file under {@link #CONLL_DICT_DIR}, passing {@code true} as the flag. */
    private static Pipe lexicon(String featureName, String relativePath) throws IOException {
        return new LexiconMembership(featureName, new File(CONLL_DICT_DIR + relativePath), true);
    }
}
