de.folt.util
Class WordHandling

java.lang.Object
  extended by de.folt.util.WordHandling

public class WordHandling
extends java.lang.Object

This class implements several methods dealing with word handling, e.g. splitting up segments into words. The default split string is defined as:

 defaultSplitString = "\\s" + "|[" + Pattern.quote(".*()[]:;,'#+=?!$%&\"") + "]";
 

Author:
klemens

Constructor Summary
WordHandling()
           
 
Method Summary
 java.lang.String getDefaultSplitString()
           
 java.lang.String getDefaultSplitStringDE()
           
 java.lang.String getDefaultSplitStringEN()
           
 java.lang.String getDefaultSplitStringES()
           
static java.lang.String getDefaultWordSplitChars()
           
 java.util.Hashtable<java.lang.String,java.lang.String> getLanguageDefaultWordSplitChars()
           
 java.lang.String getSplitChars(java.lang.String language)
           
 void init()
          Initialise the word split chars
 boolean isbXmlMode()
           
static void main(java.lang.String[] args)
           
 java.lang.String[] segmentToWordArray(java.lang.String string)
          segmentToWordArray segments a string into an array of words using defaultSplitString (Pattern.quote(".*()[]:;,'#+=?!$%&\"{}<>"))
 java.lang.String[] segmentToWordArray(java.lang.String string, java.lang.String language)
          segmentToWordArray segments a string into an array of words based on a language
 void setbXmlMode(boolean bXmlMode)
           
 void setDefaultSplitString(java.lang.String defaultSplitString)
           
 void setDefaultSplitStringDE(java.lang.String defaultSplitStringDE)
           
 void setDefaultSplitStringEN(java.lang.String defaultSplitStringEN)
           
 void setDefaultSplitStringES(java.lang.String defaultSplitStringES)
           
static void setDefaultWordSplitChars(java.lang.String defaultWordSplitChars)
           
 void setLanguageDefaultWordSplitChars(java.util.Hashtable<java.lang.String,java.lang.String> languageDefaultWordSplitChars)
           
 void setlanguageWordSplitChars(java.lang.String language, java.lang.String splitChars)
          Add or replace a split character for a language
static java.lang.String stem(java.lang.String text, java.lang.String language)
           
 
Methods inherited from class java.lang.Object
equals, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

WordHandling

public WordHandling()
Method Detail

getDefaultWordSplitChars

public static java.lang.String getDefaultWordSplitChars()

main

public static void main(java.lang.String[] args)

setDefaultWordSplitChars

public static void setDefaultWordSplitChars(java.lang.String defaultWordSplitChars)

getDefaultSplitString

public java.lang.String getDefaultSplitString()
Returns:
the defaultSplitString

getDefaultSplitStringDE

public java.lang.String getDefaultSplitStringDE()

getDefaultSplitStringEN

public java.lang.String getDefaultSplitStringEN()

getDefaultSplitStringES

public java.lang.String getDefaultSplitStringES()

getLanguageDefaultWordSplitChars

public java.util.Hashtable<java.lang.String,java.lang.String> getLanguageDefaultWordSplitChars()

getSplitChars

public java.lang.String getSplitChars(java.lang.String language)
Parameters:
language -
Returns:

init

public void init()
Initialise the word split chars


isbXmlMode

public boolean isbXmlMode()

segmentToWordArray

public java.lang.String[] segmentToWordArray(java.lang.String string)
segmentToWordArray segments a string into an array of words using defaultSplitString (Pattern.quote(".*()[]:;,'#+=?!$%&\"{}<>"))

Parameters:
string - the string to segment
Returns:
the array with all the words

segmentToWordArray

public java.lang.String[] segmentToWordArray(java.lang.String string,
                                             java.lang.String language)
segmentToWordArray segments a string into an array of words based on a language

Parameters:
string - the string to segment
language - the language to use, e.g. de-de; the lookup first searches for de-de and then for de; if nothing is found, defaultSplitString is used (Pattern.quote(".*()[]:;,'#+=?!$%&\"{}"))
Returns:
the array with all the words

setbXmlMode

public void setbXmlMode(boolean bXmlMode)

setDefaultSplitString

public void setDefaultSplitString(java.lang.String defaultSplitString)

setDefaultSplitStringDE

public void setDefaultSplitStringDE(java.lang.String defaultSplitStringDE)

setDefaultSplitStringEN

public void setDefaultSplitStringEN(java.lang.String defaultSplitStringEN)

setDefaultSplitStringES

public void setDefaultSplitStringES(java.lang.String defaultSplitStringES)

setLanguageDefaultWordSplitChars

public void setLanguageDefaultWordSplitChars(java.util.Hashtable<java.lang.String,java.lang.String> languageDefaultWordSplitChars)

setlanguageWordSplitChars

public void setlanguageWordSplitChars(java.lang.String language,
                                      java.lang.String splitChars)
Add or replace a split character for a language

Parameters:
language - the language code
splitChars - the characters to use (they will be quoted for use in a regular expression, and the \s whitespace class from the regular expression will be added)

stem

public static java.lang.String stem(java.lang.String text,
                                    java.lang.String language)
Parameters:
text -
language -
Returns: