Wednesday, October 15, 2014

Testing an OpenNLP Named Entity Recognizer Model Using Java

First, you need an NER model to test: either train one yourself or download an existing pre-trained model directly from this link.
Please refer to my previous blog post on training a model.


Then you can load your NER model and test sentences with the following code, which I have tried myself. This code extracts locations from a sentence and prints them to the terminal. First you have to tokenize your sentence; you can do this either with a model or manually. Here I have used a trained "en-token.bin" OpenNLP model for that. You can also use the split command if you want to do it manually, but I strongly recommend using a model for tokenization.
After that, you can load your NER model and run it on the tokens to recognize names. Here I have loaded my previously trained model "en-location.bin" for that.



import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;


import opennlp.tools.namefind.NameFinderME;

import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;


/**
 * Small demo that tokenizes a sentence with an OpenNLP tokenizer model and then
 * runs an OpenNLP name finder model over the tokens to extract locations.
 *
 * <p>Both model files ("en-token.bin" and "en-location.bin") are read from the
 * current working directory.
 */
public class train {

 /** Sample sentence used by {@link #main(String[])}. */
 static String sentence = "RT @Madhawa:there is traffic in Kandy";

 /** Length of the array returned by {@link #findLocation(String)}. */
 private static final int MAX_LOCATIONS = 6;

 /**
  * Finds location names in the given sentence and prints the tokens, each
  * detected location and its probability to standard output.
  *
  * <p>This is best-effort: if a model file is missing or unreadable the problem
  * is reported on standard error and whatever was collected so far is returned.
  *
  * @param sentence the raw sentence to analyse; {@code null} yields an array
  *                 containing only {@code null} entries
  * @return an array of fixed length 6 holding the text of the first detected
  *         locations in order; unused slots are {@code null}
  */
 public String[] findLocation(String sentence) {
  String[] stream = new String[MAX_LOCATIONS];
  if (sentence == null) {
   // The tokenizer cannot handle null; there is nothing to find.
   return stream;
  }

  try {
   // Convert the sentence into tokens with the pre-trained tokenizer model.
   String[] tokens = tokenize(sentence);
   for (String token : tokens) {
    System.out.println(token);
   }

   // Run the location model over the tokens.
   NameFinderME locationFinder = loadLocationFinder();
   Span[] nameSpans = locationFinder.find(tokens);
   // Probabilities are aligned index-by-index with nameSpans.
   double[] spanProbs = locationFinder.probs(nameSpans);

   for (int i = 0; i < nameSpans.length; i++) {
    String covered = coveredText(tokens, nameSpans[i]);
    // Only the first MAX_LOCATIONS hits fit in the result; later ones are
    // still printed but must not overflow the array.
    if (i < stream.length) {
     stream[i] = covered;
    }
    System.out.println("Covered text is: " + covered);
    System.out.println("Probability is: " + spanProbs[i]);
   }
  } catch (FileNotFoundException e) {
   System.err.println("Model file not found: " + e.getMessage());
   e.printStackTrace();
  } catch (InvalidFormatException e) {
   System.err.println("Model file has an invalid format: " + e.getMessage());
   e.printStackTrace();
  } catch (IOException e) {
   System.err.println("Failed to read model file: " + e.getMessage());
   e.printStackTrace();
  }

  return stream;
 }

 /** Loads "en-token.bin", tokenizes the sentence and closes the model stream. */
 private String[] tokenize(String text) throws IOException {
  try (InputStream modelIn = new FileInputStream("en-token.bin")) {
   Tokenizer tokenizer = new TokenizerME(new TokenizerModel(modelIn));
   return tokenizer.tokenize(text);
  }
 }

 /** Loads "en-location.bin" into a name finder and closes the model stream. */
 private NameFinderME loadLocationFinder() throws IOException {
  try (InputStream modelIn = new FileInputStream("en-location.bin")) {
   return new NameFinderME(new TokenNameFinderModel(modelIn));
  }
 }

 /** Joins every token covered by the span (start inclusive, end exclusive) with single spaces. */
 private static String coveredText(String[] tokens, Span span) {
  StringBuilder text = new StringBuilder();
  for (int t = span.getStart(); t < span.getEnd(); t++) {
   if (text.length() > 0) {
    text.append(' ');
   }
   text.append(tokens[t]);
  }
  return text.toString();
 }

 /**
  * Runs the location finder on the sample {@link #sentence}.
  *
  * @param args ignored
  */
 public static void main(String[] args) {
  new train().findLocation(sentence);
 }

}

1 comment: