22 package cz.vutbr.fit.knot.annotations.fragmentUpdater;
26 import java.util.ArrayList;
27 import java.util.Iterator;
28 import javax.xml.xpath.XPathExpressionException;
29 import org.w3c.dom.Node;
45 STATIC, CHARACTER, CHARACTER_BIDIRECTIONALLY,
WORD, FIRST_AND_LAST_WORD_THEN_LETTER
124 case CHARACTER_BIDIRECTIONALLY:
125 tm =
"Character_bidirectionally";
130 case FIRST_AND_LAST_WORD_THEN_LETTER:
131 tm =
"First_and_last_word_then_letter";
150 int beginIndex, endIndex;
151 String nodeSubstring =
"", XPathString;
161 beginIndex = queryFragment.getOffset();
162 endIndex = beginIndex + queryFragment.getLength();
169 nodeSubstring =
getNodeContent(node).substring(beginIndex, endIndex);
173 XPathString = XPathHelper.XPathStringOfNode(node);
177 queryFragment.getLength(), nodeSubstring);
183 case CHARACTER_BIDIRECTIONALLY:
185 int beginIndexL, beginIndexR, endIndexL, endIndexR;
186 beginIndexL = beginIndexR = queryFragment.getOffset();
187 endIndexL = endIndexR = beginIndexL + queryFragment.getLength();
192 beginIndexL = beginIndexR = endIndexL - queryFragment.getLength();
196 boolean endIteration =
false;
198 while(!endIteration){
203 nodeSubstring =
getNodeContent(node).substring(beginIndexR, endIndexR);
207 XPathString = XPathHelper.XPathStringOfNode(node);
211 queryFragment.getLength(), nodeSubstring);
215 endIteration =
false;
219 if(beginIndexL >= 0){
220 nodeSubstring =
getNodeContent(node).substring(beginIndexL, endIndexL);
224 XPathString = XPathHelper.XPathStringOfNode(node);
228 queryFragment.getLength(), nodeSubstring);
232 endIteration =
false;
240 endIndex = beginIndex + queryFragment.getLength();
244 nodeSubstring =
getNodeContent(node).substring(beginIndex, endIndex);
248 XPathString = XPathHelper.XPathStringOfNode(node);
252 queryFragment.getLength(), nodeSubstring);
262 case FIRST_AND_LAST_WORD_THEN_LETTER:
263 ArrayList<IndexAndlength> firstWordsHit =
new ArrayList<IndexAndlength>();
264 ArrayList<IndexAndlength> lastWordsHit =
new ArrayList<IndexAndlength>();
293 if (node.getNodeType() == Node.TEXT_NODE || node.getNodeType() == Node.CDATA_SECTION_NODE) {
294 return node.getNodeValue();
310 throws XPathExpressionException {
311 String words[] = newFragment.getText().split(
" ");
312 int minBadnessBeg, minBadnessEnd, badness;
313 minBadnessBeg = badness = Integer.MAX_VALUE;
315 String origText = origFragment.getText();
316 int origOffset = origFragment.getOffset();
318 String origNewText = newFragment.getText();
319 int origNewOffset = newFragment.getOffset();
320 String origNewXpath = newFragment.getXPathString();
322 int maxIterations = words.length;
330 ArrayList<UpdatableFragment> BestSubParts =
new ArrayList<UpdatableFragment>();
333 while(index < maxIterations){
335 String removedFromBeg = origNewText.substring(newOffset);
338 int LD = Util.levenshtein(origText, removedFromBeg);
339 double LPerc = (LD / (double)origText.length()) * 100;
341 badness = 5* (int)LPerc + Math.abs(origOffset - newOffset);
344 if(badness > minBadnessBeg){
348 minBadnessBeg = badness;
352 minBadnessEnd = Integer.MAX_VALUE;
355 while(index2 < (maxIterations-index)){
357 newPart = removedFromBeg.substring(0,
countWordsLengthEnd(index2, words, removedFromBeg.length()));
359 int LD2 = Util.levenshtein(origText, newPart);
360 double LPerc2 = (LD2 / (double)origText.length()) * 100;
362 badness = 5 * (int)LPerc2 + Math.abs(origOffset - newOffset);
365 if(badness > minBadnessEnd){
370 minBadnessEnd = badness;
372 minOffset = newOffset;
378 BestSubParts.add(
new UpdatableFragment(origNewXpath, origNewOffset+minOffset, minPart.length(), minPart));
388 if(BestSubParts.size() > 1){
392 else if(BestSubParts.size() == 1){
393 return BestSubParts.get(0);
409 for(
int i = 0; i < count; i++){
410 offset += words[i].length()+1;
424 int offset = wholeLength;
425 for(
int i = words.length-1; i >= words.length - count; i--){
426 offset -= words[i].length()+1;
440 ArrayList<UpdatableFragment> BestSubParts)
441 throws XPathExpressionException{
442 if(BestSubParts.isEmpty()){
445 else if(BestSubParts.size() == 1){
446 return BestSubParts.get(0);
449 int minBadness = Integer.MAX_VALUE;
451 String originalText = origFragment.getText();
452 int orignalOffset = origFragment.getOffset();
454 Iterator<UpdatableFragment> itFrg = BestSubParts.iterator();
462 while(itFrg.hasNext()){
463 comparedfragment = itFrg.next();
465 int LD = Util.levenshtein(originalText, comparedfragment.getText());
466 double LPerc = (LD / (double)originalText.length()) * 100;
468 int badness = 5 * (int)LPerc +
469 Math.abs(orignalOffset - comparedfragment.getOffset());
470 if(badness < minBadness){
471 minBadness = badness;
477 return BestSubParts.get(bestIndex);
489 throws XPathExpressionException
493 StringBuilder nodeSb;
495 boolean match =
false;
496 StringBuilder newFragment =
new StringBuilder();
497 int newFragmentOffset = 0;
500 ArrayList<UpdatableFragment> bestFragments =
new ArrayList<UpdatableFragment>();
503 for (
int i = 0; i <= words.length - queryFragment.getWordCount(); i++) {
504 nodeSb =
new StringBuilder();
506 wordOffset += words[i-1].length()+1;
510 for (
int j = 0; j < queryFragment.getWordCount(); j++) {
511 nodeSb.append(words[i + j]);
512 if (j != queryFragment.getWordCount() - 1) {
520 newFragment =
new StringBuilder(nodeSb.toString());
521 newFragmentOffset = wordOffset;
525 newFragment.append(
' ');
526 newFragment.append(words[i + queryFragment.getWordCount()-1]);
530 else if(match ==
true){
534 newFragment.length(), newFragment.toString());
541 if(bestFragments.size() > 1){
545 else if(bestFragments.size() == 1){
546 return bestFragments.get(0);
566 ArrayList<IndexAndlength> firstWordsHit, ArrayList<IndexAndlength> lastWordsHit)
567 throws XPathExpressionException
569 if(queryFragment.getWordCount() <= 1){
576 String[] fragmentWords = queryFragment.getText().split(
" ");
578 String XPath = XPathHelper.XPathStringOfNode(node);
581 String firstWord = fragmentWords[0];
583 String lastWord = fragmentWords[fragmentWords.length-1];
586 IndexAndlength IaL =
new IndexAndlength();
588 int maxLength = (int) (queryFragment.getLength() * this.maxLenghtEnlargement);
590 int minLength = (int) (queryFragment.getLength() * this.maxLenghtReduction);
593 ArrayList<UpdatableFragment> bestFragments =
new ArrayList<UpdatableFragment>();
596 while(IaL.index < words.length){
597 IaL.index =
matchFirstWord(words, firstWord, lastWord, IaL, firstWordsHit, lastWordsHit);
599 if(IaL.index != null){
602 maxLength, minLength, XPath);
605 bestFragments.add(fr);
607 IaL.offset += words[IaL.index-1].length()+1;
629 private Integer
matchFirstWord(String[] nodeWords, String firstWord, String lastWord,
630 IndexAndlength IaL, ArrayList<IndexAndlength> firstWordsHit, ArrayList<IndexAndlength> lastWordsHit)
632 for (
int i = IaL.index; i <= nodeWords.length-1; i++) {
634 IaL.offset += nodeWords[i-1].length()+1;
637 IaL.comparedWordLength = nodeWords[i].length();
638 IndexAndlength IaLHit =
new IndexAndlength(i, IaL.comparedWordLength, IaL.offset);
639 firstWordsHit.add(IaLHit);
643 IndexAndlength IaLHit =
new IndexAndlength(i, nodeWords[i].length(), IaL.offset);
644 lastWordsHit.add(IaLHit);
664 IndexAndlength IaL,
int maxLength,
int minLength, String XPath)
throws XPathExpressionException
667 int maxSize = maxLength - (IaL.comparedWordLength);
670 for (
int i = IaL.index; i <= nodeWords.length-1; i++) {
671 actualSize += nodeWords[i].length()+1;
672 if(actualSize > maxSize){
677 if(fragmentString.length() < minLength){
681 return new UpdatableFragment(XPath, IaL.offset, fragmentString.length(), fragmentString);
699 StringBuilder sb =
new StringBuilder();
700 for(
int i = firstWordIndex; i <= lastWordIndex; i++){
702 if(i < lastWordIndex){
706 return sb.toString();
723 ArrayList<IndexAndlength> firstWordsHit, ArrayList<IndexAndlength> lastWordsHit)
724 throws XPathExpressionException
726 if(firstWordsHit.isEmpty() && lastWordsHit.isEmpty()){
731 String[] fragmentWords = queryFragment.getText().split(
" ");
732 String XPath = XPathHelper.XPathStringOfNode(node);
734 String firstWord = fragmentWords[0];
735 String lastWord = fragmentWords[fragmentWords.length-1];
737 ArrayList<UpdatableFragment> matchedFragments =
new ArrayList<UpdatableFragment>();
739 int maxLength = (int) (queryFragment.getLength() * this.maxLenghtEnlargement);
740 int minLength = (int) (queryFragment.getLength() * this.maxLenghtReduction);
742 Iterator<IndexAndlength> it = firstWordsHit.iterator();
750 matchedFragments.add(fr);
754 it = lastWordsHit.iterator();
760 matchedFragments.add(fr);
765 if(matchedFragments.isEmpty()){
766 it = firstWordsHit.iterator();
771 fr =
matchMostSimiliar(words, IaL, maxLength, minLength, XPath, queryFragment.getText());
773 matchedFragments.add(fr);
794 IndexAndlength IaL,
int maxLength,
int minLength, String XPath)
795 throws XPathExpressionException
798 int maxSize = maxLength - (IaL.comparedWordLength);
801 char lastLetter = word.charAt(word.length()-1);
803 for(
int i = IaL.index+1; i < nodeWords.length; i++){
804 actualSize += nodeWords[i].length()+1;
805 if(actualSize > maxSize){
808 if(nodeWords[i].charAt(nodeWords[i].length()-1) == lastLetter) {
810 if(fragmentString.length() < minLength){
814 return new UpdatableFragment(XPath, IaL.offset, fragmentString.length(), fragmentString);
835 IndexAndlength IaL,
int maxLength,
int minLength, String XPath)
836 throws XPathExpressionException
839 int maxSize = maxLength - (IaL.comparedWordLength);
842 char firstLetter = word.charAt(0);
844 for(
int i = IaL.index-1; i >= 0; i--){
845 actualSize += nodeWords[i].length()+1;
846 if(actualSize > maxSize){
849 IaL.offset -= nodeWords[i].length()+1;
850 if(nodeWords[i].charAt(0) == firstLetter) {
852 if(fragmentString.length() < minLength){
856 return new UpdatableFragment(XPath, IaL.offset, fragmentString.length(), fragmentString);
877 IndexAndlength IaL,
int maxLength,
int minLength, String XPath, String originalFragment)
878 throws XPathExpressionException
880 int distance = Integer.MAX_VALUE;
882 int maxSize = maxLength - (IaL.comparedWordLength);
884 String bestFragment = null;
886 for(
int i = IaL.index+1; i < nodeWords.length; i++){
887 actualSize += nodeWords[i].length()+1;
888 if(actualSize > maxSize){
893 if((newDistance =
Util.
levenshtein(fragmentString, originalFragment)) <= distance){
894 distance = newDistance;
895 bestFragment = fragmentString;
898 else if(bestFragment.length() >= minLength){
899 return new UpdatableFragment(XPath, IaL.offset, bestFragment.length(), bestFragment);
902 if(bestFragment == null || bestFragment.length() < minLength){
906 return new UpdatableFragment(XPath, IaL.offset, bestFragment.length(), bestFragment);
IndexAndlength(Integer index, int comparedWordLength, int offset)
Generic abstract class for compare methods.
CompareMethod compareMethod
double maxLenghtReduction
int countWordsLengthBeg(int count, String words[])
Integer matchFirstWord(String[] nodeWords, String firstWord, String lastWord, IndexAndlength IaL, ArrayList< IndexAndlength > firstWordsHit, ArrayList< IndexAndlength > lastWordsHit)
static String XPathStringOfNode(Node node)
static int levenshtein(String s, String t)
Auxiliary class with index, offset and word length.
int countWordsLengthEnd(int count, String words[], int wholeLength)
UpdatableFragment selectBestPart(UpdatableFragment origFragment, UpdatableFragment newFragment)
UpdatableFragment matchMostSimiliar(String[] nodeWords, IndexAndlength IaL, int maxLength, int minLength, String XPath, String originalFragment)
Comparator(CompareMethod compareMethod, TraversingMethod traversingMethod, double maxLenghtEnlargement, double maxLenghtReduction)
UpdatableFragment wordMatchMethod(Node node, UpdatableFragment queryFragment)
UpdatableFragment matchLastWord(String[] nodeWords, String word, IndexAndlength IaL, int maxLength, int minLength, String XPath)
Class for an XML document fragment.
Comparator(CompareMethod compareMethod, TraversingMethod traversingMethod)
Names of traversing methods.
TraversingMethod traversingMethod
Class consisting of a traversing method and a compare method.
abstract boolean compare(String first, String second)
static String getNodeContent(Node node)
UpdatableFragment firstAndLastLetterMatchMethod(Node node, UpdatableFragment queryFragment, ArrayList< IndexAndlength > firstWordsHit, ArrayList< IndexAndlength > lastWordsHit)
TraversingMethod getTraversingMethod()
Helper class with utility XPath methods.
double maxLenghtEnlargement
UpdatableFragment compare(UpdatableFragment queryFragment, Node node)
UpdatableFragment matchSameFirstLetter(String[] nodeWords, String word, IndexAndlength IaL, int maxLength, int minLength, String XPath)
Utility class (manipulates RFC 3339 dates)
CompareMethod getCompareMethod()
String createFragmentStringFromWords(String[] words, int firstWordIndex, int lastWordIndex)
UpdatableFragment matchSameLastLetter(String[] nodeWords, String word, IndexAndlength IaL, int maxLength, int minLength, String XPath)
UpdatableFragment firstAndLastWordMatchMethod(Node node, UpdatableFragment queryFragment, ArrayList< IndexAndlength > firstWordsHit, ArrayList< IndexAndlength > lastWordsHit)
UpdatableFragment selectBestSubFragment(UpdatableFragment origFragment, ArrayList< UpdatableFragment > BestSubParts)
void setmaxLenghtEnlargement(double maxLenghtEnlargement)