Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit fe7a5bc

Browse files
committed
feat(transducer): transduce partial char sequences
1 parent e4b6a0a commit fe7a5bc

File tree

4 files changed

+55
-11
lines changed

4 files changed

+55
-11
lines changed

‎src/main/java/com/indoqa/fsa/Transducer.java‎

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
importjava.util.ArrayList;
2020
importjava.util.List;
2121

22+
importcom.indoqa.fsa.utils.TokenCandidate;
23+
2224
publicinterfaceTransducer {
2325

2426
/**
@@ -46,7 +48,11 @@ default List<Token> getAllMatches(CharSequence sequence) {
4648
*
4749
* @return All transduced {@link Token tokens}. Never <code>null</code>.
4850
*/
49-
List<Token>getAllMatches(CharSequencesequence,intstart,intlength);
51+
defaultList<Token>getAllMatches(CharSequencesequence,intstart,intlength) {
52+
returnthis.getAllMatches(sequence,start,length,newArrayList<>());
53+
}
54+
55+
List<Token>getAllMatches(CharSequencesequence,intstart,intlength,List<Token>result);
5056

5157
/**
5258
* Find all transducible tokens anywhere in the given <code>sequence</code>.<br/>
@@ -122,6 +128,18 @@ default List<Token> getAllTokens(CharSequence sequence) {
122128
*/
123129
StringgetLongestMatch(CharSequencesequence);
124130

131+
/**
132+
* Find all transducible tokens anywhere in the given <code>sequence</code>.<br/>
133+
* <br/>
134+
* Matches are not required to happen at a token boundary. <br/>
135+
* If tokens overlap, only the longest will be returned. Uses the length of the match (not of the transduction) for comparison.
136+
*
137+
* @param sequence The sequence in which to find tokens.
138+
*
139+
* @return All tokens found. Never <code>null</code>.
140+
*/
141+
List<Token>getLongestOccurrences(CharSequencesequence);
142+
125143
/**
126144
* Find all transducible tokens anywhere in the given <code>sequence</code>.<br/>
127145
* <br/>
@@ -159,4 +177,6 @@ default CharSequence transduce(CharSequence sequence) {
159177
*/
160178
CharSequencetransduce(CharSequencesequence,CharSequencedefaultValue);
161179

180+
/**
 * Transduce the part of the given <code>sequence</code> described by <code>start</code> and <code>length</code>.
 *
 * @param sequence The sequence containing the characters to transduce.
 * @param start The index of the first character to consider.
 * @param length The number of characters to consider.
 *
 * @return The transduction of the given range. NOTE(review): MorfologikTransducer returns <code>null</code> when the range
 *         is not fully matched; confirm whether that is the contract for every implementation.
 */
CharSequencetransduce(CharSequencesequence,intstart,intlength);
181+
162182
}

‎src/main/java/com/indoqa/fsa/character/CharAcceptor.java‎

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,7 @@ public List<Token> getLongestTokens(CharSequence sequence, int start, int length
191191
returnTokenCandidate.eliminateOverlapping(this.getAllTokens(sequence,start,length));
192192
}
193193

194-
protectedList<Token>getAllPrefixes(CharSequencesequence,intstart,intlength,charseparator) {
195-
List<Token>result =newArrayList<>();
196-
194+
protectedvoidgetAllPrefixes(CharSequencesequence,intstart,intlength,charseparator,List<Token>result) {
197195
intmatchedLength =0;
198196
intindex =0;
199197
intarc =0;
@@ -214,8 +212,6 @@ protected List<Token> getAllPrefixes(CharSequence sequence, int start, int lengt
214212
result.add(Token.create(start,sequence.subSequence(start,start +matchedLength).toString()));
215213
}
216214
}
217-
218-
returnresult;
219215
}
220216

221217
protectedStringgetInput(intstartIndex) {

‎src/main/java/com/indoqa/fsa/character/CharTransducer.java‎

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ public CharTransducer(CharAcceptor charAcceptor, char separator) {
3939
}
4040

4141
@Override
42-
publicList<Token>getAllMatches(CharSequencesequence,intstart,intlength) {
43-
List<Token>result =this.charAcceptor.getAllPrefixes(sequence,start,length,this.separator);
42+
publicList<Token>getAllMatches(CharSequencesequence,intstart,intlength,List<Token>result) {
43+
this.charAcceptor.getAllPrefixes(sequence,start,length,this.separator,result);
4444

4545
CharMatchcharMatch =CharMatch.partialMatchAllowed();
4646
for (Tokentoken :result) {
@@ -97,6 +97,11 @@ public String getLongestMatch(CharSequence sequence) {
9797
returnstringBuilder.toString();
9898
}
9999

100+
@Override
101+
publicList<Token>getLongestOccurrences(CharSequencesequence) {
102+
returnTokenCandidate.eliminateOverlapping(this.getAllOccurrences(sequence));
103+
}
104+
100105
@Override
101106
publicList<Token>getLongestTokens(CharSequencesequence) {
102107
returnTokenCandidate.eliminateOverlapping(this.getAllTokens(sequence));
@@ -112,6 +117,11 @@ public CharSequence transduce(CharSequence sequence, CharSequence defaultValue)
112117
returntransduced;
113118
}
114119

120+
@Override
121+
publicCharSequencetransduce(CharSequencesequence,intstart,intlength) {
122+
returnthis.transduce(sequence,start,length,CharMatch.fullMatchRequired());
123+
}
124+
115125
privateCharSequencetransduce(CharSequencesequence,intstart,intlength,CharMatchmatch) {
116126
this.charAcceptor.getLongestPrefix(sequence,start,length,this.separator,match);
117127

‎src/main/java/com/indoqa/fsa/morfologik/MorfologikTransducer.java‎

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,7 @@ protected MorfologikTransducer(Dictionary dictionary, boolean caseSensitive) {
4646
}
4747

4848
@Override
49-
publicList<Token>getAllMatches(CharSequencesequence,intstart,intlength) {
50-
List<Token>result =newArrayList<>();
51-
49+
publicList<Token>getAllMatches(CharSequencesequence,intstart,intlength,List<Token>result) {
5250
byte[]bytes =this.getBytes(sequence,start,length);
5351
Resultmatch =newResult();
5452

@@ -123,6 +121,11 @@ public String getLongestMatch(CharSequence sequence) {
123121
returnreplacement +EncodingUtils.getString(bytes,match.getMatchedLength(),bytes.length -match.getMatchedLength());
124122
}
125123

124+
@Override
125+
publicList<Token>getLongestOccurrences(CharSequencesequence) {
126+
returnTokenCandidate.eliminateOverlapping(this.getAllOccurrences(sequence));
127+
}
128+
126129
@Override
127130
publicList<Token>getLongestTokens(CharSequencesequence) {
128131
returnTokenCandidate.eliminateOverlapping(this.getAllTokens(sequence));
@@ -143,6 +146,21 @@ public CharSequence transduce(CharSequence sequence, CharSequence defaultValue)
143146
returnStandardCharsets.UTF_8.decode(byteBuffer).toString();
144147
}
145148

149+
@Override
150+
publicCharSequencetransduce(CharSequencesequence,intstart,intlength) {
151+
byte[]bytes =this.getBytes(sequence,start,length);
152+
Resultmatch =newResult();
153+
154+
this.traversal.match(match,bytes,0,bytes.length);
155+
if (match.getMatch() !=Match.NON_TERMINAL_MATCH ||match.getMatchedLength() !=sequence.length()) {
156+
returnnull;
157+
}
158+
159+
this.iterator.restartFrom(match.getNode());
160+
ByteBufferbyteBuffer =this.iterator.next();
161+
returnStandardCharsets.UTF_8.decode(byteBuffer).toString();
162+
}
163+
146164
privateTokencreateToken(CharSequencesequence,byte[]bytes,intstart,intlength) {
147165
intcharOffset =EncodingUtils.getString(bytes,0,start).length();
148166

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp