Skip to content
Advertisement

Stanford NLP API for Java: how to get a full name as one entity instead of separate parts

The aim of my code is to take a document (a PDF or a Word file), extract all the text in it, and pass that text to Stanford NLP for analysis. The code works fine, but suppose the document contains a name, e.g. “Pardeep Kumar”. The output received for it is as follows:

Pardeep NNP PERSON

Kumar NNP PERSON

But I want it to be like this:

Pardeep Kumar NNP PERSON

How do I do that? How do I check whether two adjacent words actually form one name (or a similar multi-word entity), and how do I keep them from being split into separate words?

Here is my code:

/**
 * Lets the user pick a document (.docx, .doc or .pdf), extracts its text and prints every
 * token that Stanford CoreNLP tags as a PERSON, LOCATION or DATE.
 */
public class readstuff {

    /**
     * Runs the CoreNLP pipeline over {@code data} and prints one line per token whose
     * named-entity tag is PERSON, LOCATION or DATE, in the column order word, POS tag, NER tag.
     *
     * @param data the plain text to analyse; {@code null} or empty text is skipped
     */
    public static void analyse(String data) {
        // Nothing to annotate: avoid building the pipeline and handing null to Annotation.
        if (data == null || data.isEmpty()) {
            return;
        }

        // Pipeline with POS tagging, lemmatization, NER, parsing and coreference resolution.
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // Wrap the raw text and run every configured annotator on it.
        Annotation document = new Annotation(data);
        pipeline.annotate(document);

        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        if (sentences == null) {
            return;
        }

        for (CoreMap sentence : sentences) {
            // A CoreLabel is a CoreMap with additional token-specific methods.
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                String word = token.get(CoreAnnotations.TextAnnotation.class);
                String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
                String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

                // Constant-first equals so a token without an NER tag cannot cause an NPE.
                if ("PERSON".equals(ne) || "LOCATION".equals(ne) || "DATE".equals(ne)) {
                    System.out.format("%32s%10s%16s%n", word, pos, ne);
                }
            }
        }
    }

    /**
     * Opens a file chooser, extracts the text of the selected document and analyses it.
     * Nothing is analysed when the dialog is cancelled or the file type is not supported.
     *
     * @param args unused
     * @throws FileNotFoundException if the selected file cannot be opened
     * @throws IOException if reading the document fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException, TransformerConfigurationException{

        JFileChooser window = new JFileChooser();
        if (window.showOpenDialog(null) != JFileChooser.APPROVE_OPTION) {
            return;
        }

        String data = readSelectedDocument(window);
        if (data != null) {
            analyse(data);
        }
    }

    /**
     * Extracts the text of the file currently selected in {@code window}, choosing the
     * extractor from the file extension (matched case-insensitively).
     *
     * @return the extracted text, or {@code null} after printing "format not supported"
     *         when the extension is not docx, doc or pdf
     */
    private static String readSelectedDocument(JFileChooser window) throws IOException {
        String name = window.getSelectedFile().getName();
        String extension = name.substring(name.lastIndexOf(".") + 1);

        if (extension.equalsIgnoreCase("docx")) {
            try (FileInputStream in = new FileInputStream(window.getSelectedFile())) {
                XWPFWordExtractor extract = new XWPFWordExtractor(new XWPFDocument(in));
                return extract.getText();
            }
        }

        if (extension.equalsIgnoreCase("doc")) {
            try (FileInputStream in = new FileInputStream(window.getSelectedFile())) {
                WordExtractor extract = new WordExtractor(new HWPFDocument(in));
                return extract.getText();
            }
        }

        if (extension.equalsIgnoreCase("pdf")) {
            try (FileInputStream in = new FileInputStream(window.getSelectedFile())) {
                PdfReader reader = new PdfReader(in);
                try {
                    int pageCount = reader.getNumberOfPages();
                    // Start from an empty builder so the text is not prefixed with "null".
                    StringBuilder text = new StringBuilder();
                    // Page numbers are 1-based, so the last page is pageCount itself.
                    for (int page = 1; page <= pageCount; page++) {
                        text.append(PdfTextExtractor.getTextFromPage(reader, page));
                    }
                    return text.toString();
                } finally {
                    reader.close();
                }
            }
        }

        System.out.println("format not supported");
        return null;
    }
}

Advertisement

Answer

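You want to use the entitymentions annotator. It groups consecutive tokens that belong to the same named entity into a single mention, so a name such as “Pardeep Kumar” comes back as one PERSON entity instead of two separate tokens.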

package edu.stanford.nlp.examples;

import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.util.*;

import java.util.*;

/**
 * Runs a CoreNLP pipeline that includes the {@code entitymentions} annotator over a short
 * sample text and prints every entity mention found, followed by its entity type.
 */
public class EntityMentionsExample {

  /** Text that is annotated by the example. */
  private static final String SAMPLE_TEXT =
      "John Smith visited Los Angeles on Tuesday. He left Los Angeles on Wednesday.";

  /** Annotators to run, in order; entitymentions comes after ner. */
  private static final String ANNOTATORS = "tokenize,ssplit,pos,lemma,ner,entitymentions";

  public static void main(String[] args) {
    // Configure and build the pipeline first.
    Properties config = new Properties();
    config.setProperty("annotators", ANNOTATORS);
    StanfordCoreNLP nlp = new StanfordCoreNLP(config);

    // Annotate the sample text.
    Annotation annotated = new Annotation(SAMPLE_TEXT);
    nlp.annotate(annotated);

    // Walk the sentences and print each mention together with its entity type.
    List<CoreMap> sentences = annotated.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      List<CoreMap> mentions = sentence.get(CoreAnnotations.MentionsAnnotation.class);
      for (CoreMap mention : mentions) {
        System.out.println(mention);
        System.out.println(mention.get(CoreAnnotations.EntityTypeAnnotation.class));
      }
    }
  }
}
User contributions licensed under: CC BY-SA
2 People found this is helpful
Advertisement