|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||
java.lang.Object
com.sun.labs.minion.pipeline.StageAdapter
com.sun.labs.minion.document.tokenizer.Tokenizer
com.sun.labs.minion.document.tokenizer.JCCTokenizer
public class JCCTokenizer
| Field Summary | |
|---|---|
protected java.lang.StringBuilder |
buildUp
A place to build up strings across tokens, if we need to. |
protected boolean |
isNgram
Is the data that we've built up for an ngram tokenized language? |
Token |
jj_nt
|
protected static java.lang.String |
logTag
|
protected java.util.regex.Pattern |
noBreakChars
A regular expression pattern of characters for which we should not break tokens. |
protected static java.lang.String |
PROP_NO_BREAK_CHARS
|
protected CharArrayReader |
reader
A reusable reader for the characters that we'll be passed. |
Token |
token
|
JCCTokenizerTokenManager |
token_source
|
| Fields inherited from class com.sun.labs.minion.document.tokenizer.Tokenizer |
|---|
dataSaved, indexed, logger, makeTokens, maxTokLen, pos, PROP_SEND_PUNCT, PROP_SEND_WHITE, saveData, savedData, savedLen, sendPunct, sendWhite, trimSpaces, wordNum |
| Fields inherited from class com.sun.labs.minion.pipeline.StageAdapter |
|---|
downstream, name |
| Fields inherited from interface com.sun.labs.minion.document.tokenizer.JCCTokenizerConstants |
|---|
DEFAULT, EOF, NGRAMTOKEN, NONSPACESEPCHAR, PUNCTUATION, SPACESEPCHAR, SPACESEPCHAR1, SPACESEPCHAR2, SPACESEPCHAR3, SPACESEPCHAR4, SPACESEPCHAR5, SPACESEPCHAR6, SPACESEPCHAR7, SPACESEPCHAR8, SPACESEPCHAR9, SPACESEPTOKEN, tokenImage, WHITECHAR, WHITESPACE |
| Constructor Summary | |
|---|---|
JCCTokenizer()
|
|
JCCTokenizer(java.io.InputStream stream)
|
|
JCCTokenizer(java.io.InputStream stream,
java.lang.String encoding)
|
|
JCCTokenizer(JCCTokenizerTokenManager tm)
|
|
JCCTokenizer(java.io.Reader stream)
|
|
JCCTokenizer(Stage downstream)
Creates a JavaCC tokenizer that will not send punctuation to the downstream stage. |
|
JCCTokenizer(Stage downstream,
boolean sendPunct)
Creates a JavaCC tokenizer. |
|
| Method Summary | |
|---|---|
void |
disable_tracing()
|
void |
enable_tracing()
|
void |
flush()
Flushes any collected tokens. |
ParseException |
generateParseException()
|
Token |
getNextToken()
|
Token |
getToken(int index)
|
Tokenizer |
getTokenizer(Stage s,
boolean sp)
Gets a tokenizer that we can use in the query parser. |
void |
handleLongChar(char c,
int b,
int l)
Handles a character that takes up more than one character in a file. |
void |
newProperties(com.sun.labs.util.props.PropertySheet ps)
|
boolean |
next()
End of autogenerated rules. |
void |
ReInit(java.io.InputStream stream)
|
void |
ReInit(java.io.InputStream stream,
java.lang.String encoding)
|
void |
ReInit(JCCTokenizerTokenManager tm)
|
void |
ReInit(java.io.Reader stream)
|
void |
send()
Sends the built up token, if there is one. |
protected void |
sendToken(java.lang.String t,
int type)
|
void |
setNoBreakChars(java.lang.String nbcPattern)
|
void |
text(char[] text,
int b,
int e)
Tokenize the given text. |
| Methods inherited from class com.sun.labs.minion.document.tokenizer.Tokenizer |
|---|
dump, endDocument, endField, getPos, handleFieldData, reset, reset, shutdown, startDocument, startField |
| Methods inherited from class com.sun.labs.minion.pipeline.StageAdapter |
|---|
defineField, getDownstream, getName, punctuation, savedData, setDownstream, setName, token |
| Methods inherited from class java.lang.Object |
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Field Detail |
|---|
protected CharArrayReader reader
protected java.lang.StringBuilder buildUp
protected boolean isNgram
@ConfigString(defaultValue="") protected static java.lang.String PROP_NO_BREAK_CHARS
protected java.util.regex.Pattern noBreakChars
protected static java.lang.String logTag
public JCCTokenizerTokenManager token_source
public Token token
public Token jj_nt
| Constructor Detail |
|---|
public JCCTokenizer()
public JCCTokenizer(Stage downstream)
downstream - the stage downstream of the tokenizer.
public JCCTokenizer(Stage downstream,
boolean sendPunct)
downstream - the stage downstream of the tokenizer.
sendPunct - if true, punctuation and whitespace will be passed to the downstream stage.
public JCCTokenizer(java.io.InputStream stream)
public JCCTokenizer(java.io.InputStream stream,
java.lang.String encoding)
public JCCTokenizer(java.io.Reader stream)
public JCCTokenizer(JCCTokenizerTokenManager tm)
| Method Detail |
|---|
public void text(char[] text,
int b,
int e)
Tokenizer
text in interface Stage
text in interface PipelineStage
text in class Tokenizer
text - The text to tokenize.
b - The beginning position in the text buffer.
e - The ending position in the text buffer.
public void handleLongChar(char c,
int b,
int l)
Tokenizer
handleLongChar in class Tokenizer
c - The character.
b - The beginning position of the character in the document.
l - The length of the character in the document.
public Tokenizer getTokenizer(Stage s,
boolean sp)
Tokenizer
getTokenizer in class Tokenizer
public void flush()
Tokenizer
flush in class Tokenizer
protected void sendToken(java.lang.String t,
int type)
public void send()
public void setNoBreakChars(java.lang.String nbcPattern)
public void newProperties(com.sun.labs.util.props.PropertySheet ps)
throws com.sun.labs.util.props.PropertyException
newProperties in interface com.sun.labs.util.props.Configurable
newProperties in class Tokenizer
Throws: com.sun.labs.util.props.PropertyException
public final boolean next()
throws ParseException
Throws: ParseException
public void ReInit(java.io.InputStream stream)
public void ReInit(java.io.InputStream stream,
java.lang.String encoding)
public void ReInit(java.io.Reader stream)
public void ReInit(JCCTokenizerTokenManager tm)
public final Token getNextToken()
public final Token getToken(int index)
public ParseException generateParseException()
public final void enable_tracing()
public final void disable_tracing()
|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||