de.folt.models.documentmodel.xliff
Class XliffTokenizer

java.lang.Object
  extended by de.folt.models.documentmodel.xliff.XliffTokenizer

public class XliffTokenizer
extends java.lang.Object

This class tokenizes XLIFF source, seg-source or target elements into a word list. In fact, it can tokenize any Element. The tokenizer works independently of the language.

Author:
Klemens

Constructor Summary
XliffTokenizer()
           
XliffTokenizer(XliffDocument xliffDocument)
           
XliffTokenizer(XliffDocument xliffDocument, java.lang.String language)
           
XliffTokenizer(XliffDocument xliffDocument, java.lang.String language, WordHandling wordHandling)
           
XliffTokenizer(XliffDocument xliffDocument, WordHandling wordHandling)
           
 
Method Summary
 java.lang.String getLanguage()
           
 WordHandling getWordHandling()
           
 XliffDocument getXliffDocument()
           
static void main(java.lang.String[] args)
           
 java.lang.String markUpTokens(java.util.Vector<java.lang.String> tokens)
          Constructs a string from the tokens and marks them up
 java.util.Vector<java.lang.String> removeElement(java.util.Vector<java.lang.String> tokens, java.lang.String element)
           
 java.util.Vector<java.lang.String> removeInString(java.util.Vector<java.lang.String> tokens, java.lang.String string)
           
 java.util.Vector<java.lang.String> removeMrkTokens(java.util.Vector<java.lang.String> tokens)
          Removes mrk word/stop tokens from the vector
 java.util.Vector<java.lang.String> removeString(java.util.Vector<java.lang.String> tokens, java.lang.String string)
           
 void setLanguage(java.lang.String language)
           
 void setWordHandling(WordHandling wordHandling)
           
 void setXliffDocument(XliffDocument xliffDocument)
           
static void test(java.lang.String[] args)
          Simple test method for generating DataModelInstances
 org.jdom.Element tokenize(org.jdom.Element element)
          Tokenize an xliff Element
 java.util.Vector<java.lang.String> tokenize(java.lang.String string)
           
 java.util.Vector<java.lang.String> tokenize(java.lang.String string, java.lang.String language)
          Tokenize a string
 java.util.Vector<java.lang.String> tokenizeToVector(org.jdom.Element element)
           
 java.util.Vector<java.lang.String> tokenizeToVector(org.jdom.Element element, java.lang.String language)
          Tokenize an element to a vector
 java.util.Vector<java.lang.String> tokenizeToVector(java.lang.String string)
           
 java.util.Vector<java.lang.String> tokenizeToVector(java.lang.String string, java.lang.String language)
           
 
Methods inherited from class java.lang.Object
equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

XliffTokenizer

public XliffTokenizer()

XliffTokenizer

public XliffTokenizer(XliffDocument xliffDocument)
Parameters:
xliffDocument -

XliffTokenizer

public XliffTokenizer(XliffDocument xliffDocument,
                      java.lang.String language)

XliffTokenizer

public XliffTokenizer(XliffDocument xliffDocument,
                      java.lang.String language,
                      WordHandling wordHandling)

XliffTokenizer

public XliffTokenizer(XliffDocument xliffDocument,
                      WordHandling wordHandling)
Method Detail

main

public static void main(java.lang.String[] args)
Parameters:
args -

test

public static void test(java.lang.String[] args)
Simple test method for generating DataModelInstances


getLanguage

public java.lang.String getLanguage()

getWordHandling

public WordHandling getWordHandling()

getXliffDocument

public XliffDocument getXliffDocument()
Returns:

markUpTokens

public java.lang.String markUpTokens(java.util.Vector<java.lang.String> tokens)
Constructs a string from the tokens and marks them up

Parameters:
tokens - the vector of tokens
Returns:
string with each token marked up

removeElement

public java.util.Vector<java.lang.String> removeElement(java.util.Vector<java.lang.String> tokens,
                                                        java.lang.String element)
Parameters:
tokens -
element -
Returns:

removeInString

public java.util.Vector<java.lang.String> removeInString(java.util.Vector<java.lang.String> tokens,
                                                         java.lang.String string)
Parameters:
tokens -
string -
Returns:

removeMrkTokens

public java.util.Vector<java.lang.String> removeMrkTokens(java.util.Vector<java.lang.String> tokens)
Removes mrk word/stop tokens from the vector

Parameters:
tokens - the tokens from which to remove word and stop tokens
Returns:
cleaned vector

removeString

public java.util.Vector<java.lang.String> removeString(java.util.Vector<java.lang.String> tokens,
                                                       java.lang.String string)
Parameters:
tokens -
string -
Returns:

setLanguage

public void setLanguage(java.lang.String language)

setWordHandling

public void setWordHandling(WordHandling wordHandling)

setXliffDocument

public void setXliffDocument(XliffDocument xliffDocument)
Parameters:
xliffDocument -

tokenize

public org.jdom.Element tokenize(org.jdom.Element element)
Tokenize an xliff Element

Parameters:
element -
Returns:
Element tokenized

tokenize

public java.util.Vector<java.lang.String> tokenize(java.lang.String string)
Parameters:
string -
Returns:

tokenize

public java.util.Vector<java.lang.String> tokenize(java.lang.String string,
                                                   java.lang.String language)
Tokenize a string

Parameters:
string -
Returns:
a vector of tokens

tokenizeToVector

public java.util.Vector<java.lang.String> tokenizeToVector(org.jdom.Element element)

tokenizeToVector

public java.util.Vector<java.lang.String> tokenizeToVector(org.jdom.Element element,
                                                           java.lang.String language)
Tokenize an element to a vector

Parameters:
element -
Returns:
tokenized vector

tokenizeToVector

public java.util.Vector<java.lang.String> tokenizeToVector(java.lang.String string)
Parameters:
string -
Returns:

tokenizeToVector

public java.util.Vector<java.lang.String> tokenizeToVector(java.lang.String string,
                                                           java.lang.String language)
Parameters:
string -
language -
Returns: