4A Server -  2.0
 All Classes Namespaces Files Functions Variables Enumerator
Linearizer.java
Go to the documentation of this file.
1 /*
2  * Project: Server for annotations sharing
3  * Author: Ing. Jaroslav Dytrych idytrych@fit.vutbr.cz
4  * File: Linearizer.java
5  * Description: This class contains utility functions for document linearization.
6  */
7 
8 /**
9  * @file Linearizer.java
10  *
11  * @brief Utility functions for document linearization.
12  */
13 package cz.vutbr.fit.knot.annotations.comet;
14 
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.Iterator;
24 import javax.xml.parsers.ParserConfigurationException;
25 import org.w3c.dom.Document;
26 import org.w3c.dom.DocumentFragment;
27 import org.w3c.dom.Node;
28 import org.w3c.dom.NodeList;
29 import org.w3c.dom.traversal.DocumentTraversal;
30 import org.w3c.dom.traversal.NodeFilter;
31 import org.w3c.dom.traversal.NodeIterator;
32 import org.xml.sax.SAXException;
33 
34 /**
35  * This class contains utility functions for document linearization.
36  *
37  * @brief Utility functions for document linearization.
38  * @author idytrych
39  */
40 public class Linearizer {
41 
42  /**
43  * Matcher provider for linearizing of fragments
44  */
46 
47  /**
48  * Convert list of linearized fragments to common fragments.
49  * For each linearized fragment more common fragments can be created.
50  *
51  * @param linFragments List of linearized fragments. Fragments must be sorted
52  * by offset!
53  * @param doc Document in which the fragments are located
54  * @param badFragments List to which bad fragments will be stored
55  * @return Returns list of lists of common fragments
56  */
57  public static ArrayList<ArrayList<Fragment>> linFragmentsToFragments(ArrayList<Fragment> linFragments,
58  Document doc,
59  ArrayList<ArrayList<Fragment>> badFragments) {
60  ArrayList<ArrayList<Fragment>> retFragments = new ArrayList<ArrayList<Fragment>>();
61  ArrayList<Integer> processedLengths = new ArrayList<Integer>();
62  int linLength = linFragments.size(); // number of linearized fragments
63  for (int i = 0; i < linLength; i++) { // create lists for results
64  retFragments.add(new ArrayList<Fragment>());
65  badFragments.add(new ArrayList<Fragment>());
66  processedLengths.add(0);
67  }
68  if (doc == null) {
69  return null;
70  }
71  if (doc.getDocumentElement() == null) {
72  return null;
73  } else { // if document element is presented
74  NodeIterator nIter = ((DocumentTraversal) doc).createNodeIterator(doc.getDocumentElement(),
75  NodeFilter.SHOW_TEXT + NodeFilter.SHOW_CDATA_SECTION, null, true);
76  Integer nStartOffset = 0; // start offset of current node
77  Integer nEndOffset = 0; // end offset of current node
78  String nContent = ""; // content of current node
79  int convertedLin = 0; // index of last fully converted fragment
80  Node currentNode = nIter.nextNode();
81  int newlineOffsetCompensation = 0; // offset increment for new line
82 
83  while (currentNode != null && convertedLin < linLength) {
84  // while there is next node and there is something to convert
85  nContent = currentNode.getNodeValue();
86  nEndOffset = nStartOffset + nContent.length();
87  int i = convertedLin;
88  Fragment lFr = null;
89  do {
90  lFr = linFragments.get(i);
91  // modify the offset according to the new line character count
92  lFr.setOffset(lFr.getOffset() - newlineOffsetCompensation);
93  int lFrEnd = lFr.getOffset() + lFr.getLength();
94  if (lFr.getOffset() < nStartOffset && lFrEnd < nStartOffset) {
95  // if fragment is already converted
96  if (i == convertedLin) { // if offset in list can be moved
97  convertedLin++;
98  }
99  } else if (lFr.getOffset() >= nStartOffset && lFrEnd <= nEndOffset) {
100  // if fragment is fully contained
101  String path = XPathHelper.XPathStringOfNode(currentNode);
102  Integer offset = lFr.getOffset() - nStartOffset;
103  String content = nContent.substring(offset, offset + lFr.getLength());
104  Fragment f = new Fragment(path, offset, lFr.getLength(), content, null);
105  if (lFr.getAnnotatedText().equals(content)) {
106  // if fragment content is matching
107  retFragments.get(i).add(f); // add fragment to the appropriate list
108  } else {
109  badFragments.get(i).add(f); // add fragment to the appropriate list
110  }
111  processedLengths.set(i, lFr.getLength());
112  if (i == convertedLin) { // if offset in list can be moved
113  convertedLin++;
114  }
115  } else if (lFr.getOffset() >= nStartOffset && lFr.getOffset() < nEndOffset
116  && lFrEnd > nEndOffset) { // if start of fragment is contained
117  String path = XPathHelper.XPathStringOfNode(currentNode);
118  Integer offset = lFr.getOffset() - nStartOffset;
119  Integer length = nContent.length() - offset;
120  String content = nContent.substring(offset, nContent.length());
121  Fragment f = new Fragment(path, offset, length, content, null);
122  int lFTL = lFr.getAnnotatedText().length();
123  if (length <= lFTL && lFr.getAnnotatedText().substring(0, length).equals(content)) {
124  retFragments.get(i).add(f); // add fragment to the appropriate list
125  } else {
126  badFragments.get(i).add(f); // add fragment to the appropriate list
127  }
128  processedLengths.set(i, length);
129  } else if (lFr.getOffset() < nStartOffset && lFrEnd > nStartOffset
130  && lFrEnd <= nEndOffset) { // if end of fragment is contained
131  String path = XPathHelper.XPathStringOfNode(currentNode);
132  Integer offset = 0;
133  Integer length = lFrEnd - nStartOffset;
134  String content = nContent.substring(0, lFrEnd - nStartOffset);
135  int lFTL = lFr.getAnnotatedText().length();
136  Fragment f = new Fragment(path, offset, length, content, null);
137  if (processedLengths.get(i) < lFTL
138  && lFr.getAnnotatedText().substring(processedLengths.get(i), lFTL).equals(content)) {
139  retFragments.get(i).add(f); // add fragment to the appropriate list
140  } else {
141  badFragments.get(i).add(f); // add fragment to the appropriate list
142  }
143  processedLengths.set(i, processedLengths.get(i) + length);
144  if (i == convertedLin) { // if offset in list can be moved
145  convertedLin++;
146  }
147  } else if (lFr.getOffset() < nStartOffset && lFrEnd > nEndOffset) {
148  // if part of fragment is contained
149  String path = XPathHelper.XPathStringOfNode(currentNode);
150  Integer offset = 0;
151  Integer length = nContent.length();
152  int lFTL = lFr.getAnnotatedText().length();
153  int lFTE = processedLengths.get(i) + length;
154  Fragment f = new Fragment(path, offset, length, nContent, null);
155  if (processedLengths.get(i) < lFTL && lFTE <= lFTL &&
156  lFr.getAnnotatedText().substring(processedLengths.get(i), lFTE).equals(nContent)) {
157  retFragments.get(i).add(f); // add fragment to the appropriate list
158  } else {
159  badFragments.get(i).add(f); // add fragment to the appropriate list
160  }
161  processedLengths.set(i, processedLengths.get(i) + length);
162  } else {
163  // fragment has no matching text in the current node
164  }
165  // reverse the offset modification
166  lFr.setOffset(lFr.getOffset() + newlineOffsetCompensation);
167 
168  i++;
169  } while (i < linLength && lFr.getOffset() < nEndOffset);
170 
171  nStartOffset += nContent.length(); // start offset of next node
172  currentNode = nIter.nextNode(); // move to next node of document
173  newlineOffsetCompensation += 1; // increment the compensation factor
174  } // while there is next node and there is something to convert
175  } // if document element is presented
176 
177  return retFragments;
178  } // linFragmentsToFragments()
179 
180  /**
181  * Convert list of linearized fragments to common fragments.
182  * For each linearized fragment more common fragments can be created.
183  *
184  * @param linFragments List of linearized fragments. Fragments must be sorted
185  * by offset!
186  * @param doc Document in which the fragments are located
187  * @param badFragments List to which bad fragments will be stored
188  * @return Returns list of lists of common fragments
189  */
190  public static ArrayList<ArrayList<SuggestionFragment>> linSugFragmentsToFragments(ArrayList<SuggestionFragment> linFragments,
191  Document doc,
192  ArrayList<ArrayList<SuggestionFragment>> badFragments) {
193  ArrayList<ArrayList<SuggestionFragment>> retFragments = new ArrayList<ArrayList<SuggestionFragment>>();
194  ArrayList<Integer> processedLengths = new ArrayList<Integer>();
195  int linLength = linFragments.size(); // number of linearized fragments
196  for (int i = 0; i < linLength; i++) { // create lists for results
197  retFragments.add(new ArrayList<SuggestionFragment>());
198  badFragments.add(new ArrayList<SuggestionFragment>());
199  processedLengths.add(0);
200  }
201  if (doc == null) {
202  return null;
203  }
204  if (doc.getDocumentElement() == null) {
205  return null;
206  } else { // if document element is presented
207  NodeIterator nIter = ((DocumentTraversal) doc).createNodeIterator(doc.getDocumentElement(),
208  NodeFilter.SHOW_TEXT + NodeFilter.SHOW_CDATA_SECTION, null, true);
209  Integer nStartOffset = 0; // start offset of current node
210  Integer nEndOffset = 0; // end offset of current node
211  String nContent = ""; // content of current node
212  int convertedLin = 0; // index of last fully converted fragment
213  Node currentNode = nIter.nextNode();
214  int newlineOffsetCompensation = 0; // offset increment for new line
215 
216  while (currentNode != null && convertedLin < linLength) {
217  // while there is next node and there is something to convert
218  nContent = currentNode.getNodeValue();
219 
220  // if the node is a non breaking space, we can say that it doesn't
221  // contain any suggestion, so we can speed up the evaluation
222  if (nContent.replaceAll("[\\s\\u00A0]+$", "").contentEquals("")) {
223  currentNode = nIter.nextNode(); // move to next node of document
224  continue;
225  }
226 
227  nEndOffset = nStartOffset + nContent.length();
228  int i = convertedLin;
229  SuggestionFragment lFr = null;
230  do {
231  lFr = linFragments.get(i);
232  // modify the offset according to the new line character count
233  lFr.setOffset(lFr.getOffset() - newlineOffsetCompensation);
234  int lFrEnd = lFr.getOffset() + lFr.getLength();
235  if (lFr.getOffset() < nStartOffset && lFrEnd < nStartOffset) {
236  // if fragment is already converted
237  if (i == convertedLin) { // if offset in list can be moved
238  convertedLin++;
239  }
240  } else if (lFr.getOffset() >= nStartOffset && lFrEnd <= nEndOffset) {
241  // if fragment is fully contained
242  String path = XPathHelper.XPathStringOfNode(currentNode);
243  Integer offset = lFr.getOffset() - nStartOffset;
244  String content = nContent.substring(offset, offset + lFr.getLength());
245  SuggestionFragment f = new SuggestionFragment(path, offset, lFr.getLength(), content, null);
246  if (lFr.getAnnotatedText().equals(content)) {
247  // if fragment content is matching
248  retFragments.get(i).add(f); // add fragment to the appropriate list
249  } else {
250  badFragments.get(i).add(f); // add fragment to the appropriate list
251  }
252  processedLengths.set(i, lFr.getLength());
253  if (i == convertedLin) { // if offset in list can be moved
254  convertedLin++;
255  }
256  } else if (lFr.getOffset() >= nStartOffset && lFr.getOffset() < nEndOffset
257  && lFrEnd > nEndOffset) { // if start of fragment is contained
258  String path = XPathHelper.XPathStringOfNode(currentNode);
259  Integer offset = lFr.getOffset() - nStartOffset;
260  Integer length = nContent.length() - offset;
261  String content = nContent.substring(offset, nContent.length());
262  SuggestionFragment f = new SuggestionFragment(path, offset, length, content, null);
263  int lFTL = lFr.getAnnotatedText().length();
264  if (length <= lFTL && lFr.getAnnotatedText().substring(0, length).equals(content)) {
265  retFragments.get(i).add(f); // add fragment to the appropriate list
266  } else {
267  badFragments.get(i).add(f); // add fragment to the appropriate list
268  }
269  processedLengths.set(i, length);
270  } else if (lFr.getOffset() < nStartOffset && lFrEnd > nStartOffset
271  && lFrEnd <= nEndOffset) { // if end of fragment is contained
272  String path = XPathHelper.XPathStringOfNode(currentNode);
273  Integer offset = 0;
274  Integer length = lFrEnd - nStartOffset;
275  String content = nContent.substring(0, lFrEnd - nStartOffset);
276  int lFTL = lFr.getAnnotatedText().length();
277  SuggestionFragment f = new SuggestionFragment(path, offset, length, content, null);
278  if (processedLengths.get(i) < lFTL
279  && lFr.getAnnotatedText().substring(processedLengths.get(i), lFTL).equals(content)) {
280  retFragments.get(i).add(f); // add fragment to the appropriate list
281  } else {
282  badFragments.get(i).add(f); // add fragment to the appropriate list
283  }
284  processedLengths.set(i, processedLengths.get(i) + length);
285  if (i == convertedLin) { // if offset in list can be moved
286  convertedLin++;
287  }
288  } else if (lFr.getOffset() < nStartOffset && lFrEnd > nEndOffset) {
289  // if part of fragment is contained
290  String path = XPathHelper.XPathStringOfNode(currentNode);
291  Integer offset = 0;
292  Integer length = nContent.length();
293  int lFTL = lFr.getAnnotatedText().length();
294  int lFTE = processedLengths.get(i) + length;
295  SuggestionFragment f = new SuggestionFragment(path, offset, length, nContent, null);
296  if (processedLengths.get(i) < lFTL && lFTE <= lFTL &&
297  lFr.getAnnotatedText().substring(processedLengths.get(i), lFTE).equals(nContent)) {
298  retFragments.get(i).add(f); // add fragment to the appropriate list
299  } else {
300  badFragments.get(i).add(f); // add fragment to the appropriate list
301  }
302  processedLengths.set(i, processedLengths.get(i) + length);
303  } else {
304  // fragment has no matching text in the current node
305  }
306  // reverse the offset modification
307  lFr.setOffset(lFr.getOffset() + newlineOffsetCompensation);
308 
309  i++;
310  } while (i < linLength && lFr.getOffset() < nEndOffset);
311 
312  nStartOffset += nContent.length(); // start offset of next node
313  currentNode = nIter.nextNode(); // move to next node of document
314  newlineOffsetCompensation += 1; // increment the compensation factor
315  } // while there is next node and there is something to convert
316  } // if document element is presented
317 
318  return retFragments;
319  } // linSugFragmentsToFragments()
320 
321  /**
322  * Convert list of linearized fragments to common fragments.
323  * For each linearized fragment more common fragments can be created.
324  *
325  * @param linFragments List of linearized fragments. Fragments must be sorted
326  * by offset!
327  * @param doc Document in which the fragments are located
328  * @param badFragments List to which bad fragments will be stored
329  * @return Returns list of lists of common fragments
330  */
331  public static ArrayList<ArrayList<AlternativeFragment>> linAltFragmentsToFragments(ArrayList<AlternativeFragment> linFragments,
332  Document doc,
333  ArrayList<ArrayList<AlternativeFragment>> badFragments) {
334  ArrayList<ArrayList<AlternativeFragment>> retFragments = new ArrayList<ArrayList<AlternativeFragment>>();
335  ArrayList<Integer> processedLengths = new ArrayList<Integer>();
336  int linLength = linFragments.size(); // number of linearized fragments
337  for (int i = 0; i < linLength; i++) { // create lists for results
338  retFragments.add(new ArrayList<AlternativeFragment>());
339  badFragments.add(new ArrayList<AlternativeFragment>());
340  processedLengths.add(0);
341  }
342  if (doc == null) {
343  return null;
344  }
345  if (doc.getDocumentElement() == null) {
346  return null;
347  } else { // if document element is presented
348  NodeIterator nIter = ((DocumentTraversal) doc).createNodeIterator(doc.getDocumentElement(),
349  NodeFilter.SHOW_TEXT + NodeFilter.SHOW_CDATA_SECTION, null, true);
350  Integer nStartOffset = 0; // start offset of current node
351  Integer nEndOffset = 0; // end offset of current node
352  String nContent = ""; // content of current node
353  int convertedLin = 0; // index of last fully converted fragment
354  Node currentNode = nIter.nextNode();
355  int newlineOffsetCompensation = 0; // offset increment for new line
356 
357  while (currentNode != null && convertedLin < linLength) {
358  // while there is next node and there is something to convert
359  nContent = currentNode.getNodeValue();
360 
361  // if the node is a non breaking space, we can say that it doesn't
362  // contain any suggestion, so we can speed up the evaluation
363  if (nContent.replaceAll("[\\s\\u00A0]+$", "").contentEquals("")) {
364  currentNode = nIter.nextNode(); // move to next node of document
365  continue;
366  }
367 
368  nEndOffset = nStartOffset + nContent.length();
369  int i = convertedLin;
370  AlternativeFragment lFr = null;
371  do {
372  lFr = linFragments.get(i);
373  // modify the offset according to the new line character count
374  lFr.setOffset(lFr.getOffset() - newlineOffsetCompensation);
375  int lFrEnd = lFr.getOffset() + lFr.getLength();
376  if (lFr.getOffset() < nStartOffset && lFrEnd < nStartOffset) {
377  // if fragment is already converted
378  if (i == convertedLin) { // if offset in list can be moved
379  convertedLin++;
380  }
381  } else if (lFr.getOffset() >= nStartOffset && lFrEnd <= nEndOffset) {
382  // if fragment is fully contained
383  String path = XPathHelper.XPathStringOfNode(currentNode);
384  Integer offset = lFr.getOffset() - nStartOffset;
385  String content = nContent.substring(offset, offset + lFr.getLength());
386  AlternativeFragment f = new AlternativeFragment(path, offset, lFr.getLength(), content, null);
387  if (lFr.getAnnotatedText().equals(content)) {
388  // if fragment content is matching
389  retFragments.get(i).add(f); // add fragment to the appropriate list
390  } else {
391  badFragments.get(i).add(f); // add fragment to the appropriate list
392  }
393  processedLengths.set(i, lFr.getLength());
394  if (i == convertedLin) { // if offset in list can be moved
395  convertedLin++;
396  }
397  } else if (lFr.getOffset() >= nStartOffset && lFr.getOffset() < nEndOffset
398  && lFrEnd > nEndOffset) { // if start of fragment is contained
399  String path = XPathHelper.XPathStringOfNode(currentNode);
400  Integer offset = lFr.getOffset() - nStartOffset;
401  Integer length = nContent.length() - offset;
402  String content = nContent.substring(offset, nContent.length());
403  AlternativeFragment f = new AlternativeFragment(path, offset, length, content, null);
404  int lFTL = lFr.getAnnotatedText().length();
405  if (length <= lFTL && lFr.getAnnotatedText().substring(0, length).equals(content)) {
406  retFragments.get(i).add(f); // add fragment to the appropriate list
407  } else {
408  badFragments.get(i).add(f); // add fragment to the appropriate list
409  }
410  processedLengths.set(i, length);
411  } else if (lFr.getOffset() < nStartOffset && lFrEnd > nStartOffset
412  && lFrEnd <= nEndOffset) { // if end of fragment is contained
413  String path = XPathHelper.XPathStringOfNode(currentNode);
414  Integer offset = 0;
415  Integer length = lFrEnd - nStartOffset;
416  String content = nContent.substring(0, lFrEnd - nStartOffset);
417  int lFTL = lFr.getAnnotatedText().length();
418  AlternativeFragment f = new AlternativeFragment(path, offset, length, content, null);
419  if (processedLengths.get(i) < lFTL
420  && lFr.getAnnotatedText().substring(processedLengths.get(i), lFTL).equals(content)) {
421  retFragments.get(i).add(f); // add fragment to the appropriate list
422  } else {
423  badFragments.get(i).add(f); // add fragment to the appropriate list
424  }
425  processedLengths.set(i, processedLengths.get(i) + length);
426  if (i == convertedLin) { // if offset in list can be moved
427  convertedLin++;
428  }
429  } else if (lFr.getOffset() < nStartOffset && lFrEnd > nEndOffset) {
430  // if part of fragment is contained
431  String path = XPathHelper.XPathStringOfNode(currentNode);
432  Integer offset = 0;
433  Integer length = nContent.length();
434  int lFTL = lFr.getAnnotatedText().length();
435  int lFTE = processedLengths.get(i) + length;
436  AlternativeFragment f = new AlternativeFragment(path, offset, length, nContent, null);
437  if (processedLengths.get(i) < lFTL && lFTE <= lFTL &&
438  lFr.getAnnotatedText().substring(processedLengths.get(i), lFTE).equals(nContent)) {
439  retFragments.get(i).add(f); // add fragment to the appropriate list
440  } else {
441  badFragments.get(i).add(f); // add fragment to the appropriate list
442  }
443  processedLengths.set(i, processedLengths.get(i) + length);
444  } else {
445  // fragment has no matching text in the current node
446  }
447  // reverse the offset modification
448  lFr.setOffset(lFr.getOffset() + newlineOffsetCompensation);
449 
450  i++;
451  } while (i < linLength && lFr.getOffset() < nEndOffset);
452 
453  nStartOffset += nContent.length(); // start offset of next node
454  currentNode = nIter.nextNode(); // move to next node of document
455  newlineOffsetCompensation += 1; // increment the compensation factor
456  } // while there is next node and there is something to convert
457  } // if document element is presented
458 
459  return retFragments;
460  } // linAltFragmentsToFragments()
461 
462  /**
463  * Linearizes document to string
464  *
465  * @param doc Document to linearize
466  * @return Returns linearized document in string
467  */
468  public static String linearizeDocument(Document doc) {
469  StringBuilder linDoc = new StringBuilder();
470  if (doc.getDocumentElement() == null) {
471  return null;
472  } else { // if document element is presented
473  NodeIterator nIter = ((DocumentTraversal) doc).createNodeIterator(doc.getDocumentElement(),
474  NodeFilter.SHOW_TEXT + NodeFilter.SHOW_CDATA_SECTION, null, true);
475  Node currentNode = nIter.nextNode();
476  // flag represents new line offset compensation
477  boolean compensateNewline = false;
478  while (currentNode != null) {
479  // the node containg only non breaking space character, therefore there
480  // can't be any annotation it it, skip it
481 
482  if (currentNode.getNodeValue() == null || currentNode.getNodeValue().replaceAll("[\\s\\u00A0]+$", "").contentEquals("")) {
483  currentNode = nIter.nextNode(); // move to next node of document
484  continue;
485  }
486  // append the space character after the last character of the node text data
487  // only if the value node is not a non breaking space character
488  if (compensateNewline) {
489  // this solution resolves the problem with joining words delimited
490  // by the <br> tag
491  linDoc.append(" ");
492  }
493  linDoc.append(currentNode.getNodeValue());
494  currentNode = nIter.nextNode(); // move to next node of document
495  compensateNewline = true;
496  }
497  } // if document element is presented
498  return linDoc.toString();
499  } // linearizeDocument()
500 
501  /**
502  * Convert list of common fragments to linearized fragments.
503  * For more common fragments one linearized fragment can be created.
504  *
505  * @param comFragments List of common fragments.
506  * @param doc Document in which the fragments are located
507  * @param notConverted List for fragments which were not converted
508  * @param addSpaces If true, it will add space after each node (offset will be incremented by 1)
509  * @return Returns list of linearized fragments
510  */
511  public static ArrayList<SuggestionFragment> fragmentsToLinSugFragments(ArrayList<ArrayList<SuggestionFragment>> comFragments,
512  Document doc, ArrayList<ArrayList<SuggestionFragment>> notConverted,
513  boolean addSpaces) {
514 
515  // create copy of array from which fragments will be removed and count fragments
516  int fragCount = 0;
517  int numOfAnnots = comFragments.size();
518 
519  ArrayList<ArrayList<SuggestionFragment>> comFrCopy = new ArrayList<ArrayList<SuggestionFragment>>();
520  // array of linearized fragments that will be returned by this method
521  ArrayList<SuggestionFragment> retFragments = new ArrayList<SuggestionFragment>(numOfAnnots);
522  // auxiliary array containing fragments from previous document node
523  ArrayList<SuggestionFragment> partialFragments = new ArrayList<SuggestionFragment>(numOfAnnots);
524  // auxiliary array containing number of spaces to add before start offset
525  ArrayList<Integer> numsOfSpacesSO = new ArrayList<Integer>(numOfAnnots);
526  // auxiliary array containing number of spaces to add into length
527  ArrayList<Integer> numsOfSpacesL = new ArrayList<Integer>(numOfAnnots);
528 
529  for (int i = 0; i < numOfAnnots; i++) {
530  ArrayList<SuggestionFragment> cFr = comFragments.get(i);
531  ArrayList<SuggestionFragment> aFragments = new ArrayList<SuggestionFragment>();
532  notConverted.add(new ArrayList<SuggestionFragment>());
533  comFrCopy.add(aFragments);
534  for (Iterator<SuggestionFragment> aFrIt = cFr.iterator(); aFrIt.hasNext();) {
535  SuggestionFragment fr = aFrIt.next();
536  aFragments.add(fr);
537  fragCount++;
538  }
539  // these two arrays should be initialized here, because their size is
540  // equal to the number of annotations
541  retFragments.add(null);
542  partialFragments.add(null);
543  numsOfSpacesSO.add(null);
544  numsOfSpacesL.add(null);
545  }
546 
547  if (doc == null) {
548  return null;
549  }
550  if (doc.getDocumentElement() == null) {
551  return null;
552  } else { // if document element is presented
553  NodeIterator nIter = ((DocumentTraversal) doc).createNodeIterator(doc.getDocumentElement(),
554  NodeFilter.SHOW_TEXT + NodeFilter.SHOW_CDATA_SECTION, null, true);
555  Integer nStartOffset = 0; // start offset of current node
556  Integer nEndOffset = 0; // end offset of current node
557  String nContent = ""; // content of current node
558  Node currentNode = nIter.nextNode();
559  int nodeCounter = 0;
560  String path = "";
561  while (currentNode != null && fragCount > 0) {
562  // while there is next node and there is something to convert
563  nContent = currentNode.getNodeValue();
564  int nContentL = nContent.length();
565  nEndOffset = nStartOffset + nContentL;
566  path = XPathHelper.XPathStringOfNode(currentNode);
567 
568  for (int i = 0; i < numOfAnnots; i++) {
569  // for all sets of fragments
570  ArrayList<SuggestionFragment> aFragments = comFrCopy.get(i);
571  for (Iterator<SuggestionFragment> frIt = aFragments.iterator(); frIt.hasNext();) {
572  // for all fragments for one annotation
573  SuggestionFragment fr = frIt.next();
574  if (fr.getIsGood() == false) {
575  notConverted.get(i).add(fr); // fragment can not be converted
576  frIt.remove(); // fragment was processed
577  fragCount--;
578  continue;
579  }
580  if (fr.getPath().equals(path)) { // if fragment is in the node
581  SuggestionFragment partFrag = partialFragments.get(i);
582  int frEndOffset;
583  if (fr.getOffset() != null) {
584  frEndOffset = fr.getOffset() + fr.getLength();
585  }
586  else {
587  frEndOffset = fr.getLength();
588  }
589  if (frEndOffset > nContentL) { // content of fragment is too long
590  notConverted.get(i).add(fr);
591  frIt.remove();
592  fragCount--;
593  continue;
594  } else if (fr.getAnnotatedText() != null && !nContent.substring(fr.getOffset(), frEndOffset).equals(fr.getAnnotatedText())) {
595  // fr.getAnnotatedText() returns null when linearizing suggestion fragments
596  // content is not matching
597  notConverted.get(i).add(fr);
598  frIt.remove();
599  fragCount--;
600  continue;
601  }
602  if (partFrag == null && frEndOffset < nContentL) {
603  // whole linearized fragment in node
604  if (fr.getOffset() == null) {
605  // When user requests fragment from the start of a node editor sends offset as null, which causes NullPointerException
606  fr.setOffset(0);
607  }
608  numsOfSpacesSO.set(i,nodeCounter); // set number of spaces before start offset for this fragment
609  numsOfSpacesL.set(i,0); // set number of spaces inside length for this fragment
610  SuggestionFragment linFragment = new SuggestionFragment("", nStartOffset + fr.getOffset(),
611  fr.getLength(), fr.getAnnotatedText(),
612  fr.getRefSuggestion());
613  frIt.remove(); // fragment was successfully processed
614  fragCount--;
615  retFragments.set(i, linFragment); // set result
616  } else if (partFrag == null && frEndOffset == nContentL) {
617 
618  // start of linearized fragment in node
619  if (fr.getOffset() == null) {
620  fr.setOffset(0);
621  }
622  numsOfSpacesSO.set(i,nodeCounter); // set number of spaces before start offset for this fragment
623  numsOfSpacesL.set(i,0); // set number of spaces inside length for this fragment
624  SuggestionFragment linFragment = new SuggestionFragment("", nStartOffset + fr.getOffset(),
625  fr.getLength(), fr.getAnnotatedText(),
626  fr.getRefSuggestion());
627 
628  partialFragments.set(i, linFragment);
629  frIt.remove(); // fragment was successfully processed
630  fragCount--;
631  } else if (partFrag != null && frEndOffset < nContentL) {
632  // end of linearized fragment in node
633  SuggestionFragment linFragment = partialFragments.get(i);
634  if (fr.getAnnotatedText() != null) {
635  linFragment.setAnnotatedText(linFragment.getAnnotatedText() + fr.getAnnotatedText());
636  }
637  linFragment.setLength(linFragment.getLength() + fr.getLength());
638  retFragments.set(i, linFragment);
639  partialFragments.set(i, null);
640  if (numsOfSpacesL.get(i) != null) { // add space after previous node
641  numsOfSpacesL.set(i,numsOfSpacesL.get(i) + 1);
642  }
643  frIt.remove();
644  fragCount--;
645  } else if (partFrag != null && frEndOffset == nContentL) {
646  // part of linearized fragment in node
647  SuggestionFragment linFragment = partialFragments.get(i);
648  if (fr.getAnnotatedText() != null) {
649  linFragment.setAnnotatedText(linFragment.getAnnotatedText() + fr.getAnnotatedText());
650  }
651  linFragment.setLength(linFragment.getLength() + fr.getLength());
652  if (numsOfSpacesL.get(i) != null) { // add space after previous node
653  numsOfSpacesL.set(i,numsOfSpacesL.get(i) + 1);
654  }
655  frIt.remove();
656  fragCount--;
657  } else {
658  // error state
659  notConverted.get(i).add(fr); // fragment can not be converted
660  frIt.remove(); // fragment was processed
661  fragCount--;
662  continue;
663  }
664  } // if fragment is in the node
665  } // for all fragments for one annotation
666  } // for all sets of fragments
667 
668 
669  nStartOffset += nContent.length(); // start offset of next node
670  currentNode = nIter.nextNode(); // move to next node of document
671  while (currentNode != null && (currentNode.getNodeValue() == null || currentNode.getNodeValue().replaceAll("[\\s\\u00A0]+$", "").contentEquals(""))) {
672  currentNode = nIter.nextNode(); // move to next node of document
673  }
674  nodeCounter++;
675  } // while there is next node and there is something to convert
676  } // if document element is presented
677 
678  for (int i = 0; i < numOfAnnots; i++) {
679  // for all sets of fragments
680  ArrayList<SuggestionFragment> aFragments = comFrCopy.get(i);
681  // move rest of fragments
682  for (Iterator<SuggestionFragment> aFrIt = aFragments.iterator(); aFrIt.hasNext();) {
683  SuggestionFragment fr = aFrIt.next();
684  notConverted.get(i).add(fr);
685  aFrIt.remove();
686  fragCount--;
687  }
688 
689  // finish processing of partial fragment
690  if (partialFragments.get(i) != null) {
691  retFragments.set(i, partialFragments.get(i)); // set result
692  }
693  } // for all sets of fragments
694 
695  if (addSpaces) { // if spaces should be added (offsets incremented)
696  for (int i = 0; i < numOfAnnots; i++) { // for each converted fragment
697  SuggestionFragment fr = retFragments.get(i);
698  if (fr == null) { // fragment was not converted successfully
699  continue;
700  }
701  // increment offset
702  Integer spaces = numsOfSpacesSO.get(i);
703  if (spaces == null) {
704  spaces = 0;
705  }
706  fr.setOffset(fr.getOffset() + spaces);
707  // increment length
708  spaces = numsOfSpacesL.get(i);
709  if (spaces == null) {
710  spaces = 0;
711  }
712  fr.setLength(fr.getLength() + spaces);
713  } // for each converted fragment
714  } // if spaces should be added (offsets incremented)
715 
716  return retFragments;
717  } // fragmentsToLinSugFragments()
718 
719  /**
720  * Convert list of common fragments to linearized fragments.
721  * For more common fragments one linearized fragment can be created.
722  *
723  * @param comFragments List of common fragments.
724  * @param doc Document in which fragments are
725  * @param notConverted List for fragments, which was not converted
726  * @param addSpaces If true, it will add space after each node (offset will be incremented by 1)
727  * @return Returns list of linearized fragments
728  */
729  public static ArrayList<Fragment> fragmentsToLinFragments(ArrayList<ArrayList<Fragment>> comFragments,
730  Document doc, ArrayList<ArrayList<Fragment>> notConverted,
731  boolean addSpaces) {
732 
733  // create copy of array from which fragments will be removed and count fragments
734  int fragCount = 0;
735  int numOfAnnots = comFragments.size();
736 
737  ArrayList<ArrayList<Fragment>> comFrCopy = new ArrayList<ArrayList<Fragment>>();
738  // array of linearized fragments that will be returned by this method
739  ArrayList<Fragment> retFragments = new ArrayList<Fragment>(numOfAnnots);
740  // auxiliary array containing fragments from previous document node
741  ArrayList<Fragment> partialFragments = new ArrayList<Fragment>(numOfAnnots);
742  // auxiliary array containing number of spaces to add before start offset
743  ArrayList<Integer> numsOfSpacesSO = new ArrayList<Integer>(numOfAnnots);
744  // auxiliary array containing number of spaces to add into length
745  ArrayList<Integer> numsOfSpacesL = new ArrayList<Integer>(numOfAnnots);
746 
747  for (int i = 0; i < numOfAnnots; i++) {
748  ArrayList<Fragment> cFr = comFragments.get(i);
749  ArrayList<Fragment> aFragments = new ArrayList<Fragment>();
750  notConverted.add(new ArrayList<Fragment>());
751  comFrCopy.add(aFragments);
752  for (Iterator<Fragment> aFrIt = cFr.iterator(); aFrIt.hasNext();) {
753  Fragment fr = aFrIt.next();
754  aFragments.add(fr);
755  fragCount++;
756  }
757  // these two arrays should be initialized here, because their size is
758  // equal to the number of annotations
759  retFragments.add(null);
760  partialFragments.add(null);
761  numsOfSpacesSO.add(null);
762  numsOfSpacesL.add(null);
763  }
764 
765  if (doc == null) {
766  return null;
767  }
768  if (doc.getDocumentElement() == null) {
769  return null;
770  } else { // if document element is presented
771  NodeIterator nIter = ((DocumentTraversal) doc).createNodeIterator(doc.getDocumentElement(),
772  NodeFilter.SHOW_TEXT + NodeFilter.SHOW_CDATA_SECTION, null, true);
773  Integer nStartOffset = 0; // start offset of current node
774  Integer nEndOffset = 0; // end offset of current node
775  String nContent = ""; // content of current node
776  Node currentNode = nIter.nextNode();
777  int nodeCounter = 0;
778  String path = "";
779  while (currentNode != null && fragCount > 0) {
780  // while there is next node and there is something to convert
781  nContent = currentNode.getNodeValue();
782  int nContentL = nContent.length();
783  nEndOffset = nStartOffset + nContentL;
784  path = XPathHelper.XPathStringOfNode(currentNode);
785 
786  for (int i = 0; i < numOfAnnots; i++) {
787  // for all sets of fragments
788  ArrayList<Fragment> aFragments = comFrCopy.get(i);
789  for (Iterator<Fragment> frIt = aFragments.iterator(); frIt.hasNext();) {
790  // for all fragments for one annotation
791  Fragment fr = frIt.next();
792  if (fr.getIsGood() == false) {
793  notConverted.get(i).add(fr); // fragment can not be converted
794  frIt.remove(); // fragment was processed
795  fragCount--;
796  continue;
797  }
798  if (fr.getPath().equals(path)) { // if fragment is in the node
799  Fragment partFrag = partialFragments.get(i);
800  int frEndOffset;
801  if (fr.getOffset() != null) {
802  frEndOffset = fr.getOffset() + fr.getLength();
803  }
804  else {
805  frEndOffset = fr.getLength();
806  }
807  if (frEndOffset > nContentL) { // content of fragment is too long
808  notConverted.get(i).add(fr);
809  frIt.remove();
810  fragCount--;
811  continue;
812  } else if (fr.getAnnotatedText() != null && !nContent.substring(fr.getOffset(), frEndOffset).equals(fr.getAnnotatedText())) {
813  // fr.getAnnotatedText() returns null when linearizing suggestion fragments
814  // content is not matching
815  notConverted.get(i).add(fr);
816  frIt.remove();
817  fragCount--;
818  continue;
819  }
820  if (partFrag == null && frEndOffset < nContentL) {
821  // whole linearized fragment in node
822  if (fr.getOffset() == null) {
823  // When user requests fragment from the start of a node editor sends offset as null, which causes NullPointerException
824  fr.setOffset(0);
825  }
826  numsOfSpacesSO.set(i,nodeCounter); // set number of spaces before start offset for this fragment
827  numsOfSpacesL.set(i,0); // set number of spaces inside length for this fragment
828  Fragment linFragment = new Fragment("", nStartOffset + fr.getOffset(),
829  fr.getLength(), fr.getAnnotatedText(),
830  fr.getRefAnnotation());
831  frIt.remove(); // fragment was successfully processed
832  fragCount--;
833  retFragments.set(i, linFragment); // set result
834  } else if (partFrag == null && frEndOffset == nContentL) {
835 
836  // start of linearized fragment in node
837  if (fr.getOffset() == null) {
838  fr.setOffset(0);
839  }
840  numsOfSpacesSO.set(i,nodeCounter); // set number of spaces before start offset for this fragment
841  numsOfSpacesL.set(i,0); // set number of spaces inside length for this fragment
842  Fragment linFragment = new Fragment("", nStartOffset + fr.getOffset(),
843  fr.getLength(), fr.getAnnotatedText(),
844  fr.getRefAnnotation());
845 
846  partialFragments.set(i, linFragment);
847  frIt.remove(); // fragment was successfully processed
848  fragCount--;
849  } else if (partFrag != null && frEndOffset < nContentL) {
850  // end of linearized fragment in node
851  Fragment linFragment = partialFragments.get(i);
852  if (fr.getAnnotatedText() != null) {
853  linFragment.setAnnotatedText(linFragment.getAnnotatedText() + fr.getAnnotatedText());
854  }
855  linFragment.setLength(linFragment.getLength() + fr.getLength());
856  retFragments.set(i, linFragment);
857  partialFragments.set(i, null);
858  if (numsOfSpacesL.get(i) != null) { // add space after previous node
859  numsOfSpacesL.set(i,numsOfSpacesL.get(i) + 1);
860  }
861  frIt.remove();
862  fragCount--;
863  } else if (partFrag != null && frEndOffset == nContentL) {
864  // part of linearized fragment in node
865  Fragment linFragment = partialFragments.get(i);
866  if (fr.getAnnotatedText() != null) {
867  linFragment.setAnnotatedText(linFragment.getAnnotatedText() + fr.getAnnotatedText());
868  }
869  linFragment.setLength(linFragment.getLength() + fr.getLength());
870  if (numsOfSpacesL.get(i) != null) { // add space after previous node
871  numsOfSpacesL.set(i,numsOfSpacesL.get(i) + 1);
872  }
873  frIt.remove();
874  fragCount--;
875  } else {
876  // error state
877  notConverted.get(i).add(fr); // fragment can not be converted
878  frIt.remove(); // fragment was processed
879  fragCount--;
880  continue;
881  }
882  } // if fragment is in the node
883  } // for all fragments for one annotation
884  } // for all sets of fragments
885 
886 
887  nStartOffset += nContent.length(); // start offset of next node
888  currentNode = nIter.nextNode(); // move to next node of document
889  while (currentNode != null && (currentNode.getNodeValue() == null || currentNode.getNodeValue().replaceAll("[\\s\\u00A0]+$", "").contentEquals(""))) {
890  currentNode = nIter.nextNode(); // move to next node of document
891  }
892  nodeCounter++;
893  } // while there is next node and there is something to convert
894  } // if document element is presented
895 
896  for (int i = 0; i < numOfAnnots; i++) {
897  // for all sets of fragments
898  ArrayList<Fragment> aFragments = comFrCopy.get(i);
899  // move rest of fragments
900  for (Iterator<Fragment> aFrIt = aFragments.iterator(); aFrIt.hasNext();) {
901  Fragment fr = aFrIt.next();
902  notConverted.get(i).add(fr);
903  aFrIt.remove();
904  fragCount--;
905  }
906 
907  // finish processing of partial fragment
908  if (partialFragments.get(i) != null) {
909  retFragments.set(i, partialFragments.get(i)); // set result
910  }
911  } // for all sets of fragments
912 
913  if (addSpaces) { // if spaces should be added (offsets incremented)
914  for (int i = 0; i < numOfAnnots; i++) { // for each converted fragment
915  Fragment fr = retFragments.get(i);
916  if (fr == null) { // fragment was not converted successfully
917  continue;
918  }
919  // increment offset
920  Integer spaces = numsOfSpacesSO.get(i);
921  if (spaces == null) {
922  spaces = 0;
923  }
924  fr.setOffset(fr.getOffset() + spaces);
925  // increment length
926  spaces = numsOfSpacesL.get(i);
927  if (spaces == null) {
928  spaces = 0;
929  }
930  fr.setLength(fr.getLength() + spaces);
931  } // for each converted fragment
932  } // if spaces should be added (offsets incremented)
933 
934  return retFragments;
935  } // fragmentsToLinFragments()
936 
937  /**
938  * Convert list of common fragments to linearized fragments.
939  * For more common fragments one linearized fragment can be created.
940  *
941  * @param comFragments List of common fragments.
942  * @param doc Document in which fragments are
943  * @param notConverted List for fragments, which was not converted
944  * @param addSpaces If true, it will add space after each node (offset will be incremented by 1)
945  * @return Returns list of linearized fragments
946  */
947  public static ArrayList<AlternativeFragment> fragmentsToLinAltFragments(ArrayList<ArrayList<AlternativeFragment>> comFragments,
948  Document doc, ArrayList<ArrayList<AlternativeFragment>> notConverted,
949  boolean addSpaces) {
950 
951  // create copy of array from which fragments will be removed and count fragments
952  int fragCount = 0;
953  int numOfAnnots = comFragments.size();
954 
955  ArrayList<ArrayList<AlternativeFragment>> comFrCopy = new ArrayList<ArrayList<AlternativeFragment>>();
956  // array of linearized fragments that will be returned by this method
957  ArrayList<AlternativeFragment> retFragments = new ArrayList<AlternativeFragment>(numOfAnnots);
958  // auxiliary array containing fragments from previous document node
959  ArrayList<AlternativeFragment> partialFragments = new ArrayList<AlternativeFragment>(numOfAnnots);
960  // auxiliary array containing number of spaces to add before start offset
961  ArrayList<Integer> numsOfSpacesSO = new ArrayList<Integer>(numOfAnnots);
962  // auxiliary array containing number of spaces to add into length
963  ArrayList<Integer> numsOfSpacesL = new ArrayList<Integer>(numOfAnnots);
964 
965  for (int i = 0; i < numOfAnnots; i++) {
966  ArrayList<AlternativeFragment> cFr = comFragments.get(i);
967  ArrayList<AlternativeFragment> aFragments = new ArrayList<AlternativeFragment>();
968  notConverted.add(new ArrayList<AlternativeFragment>());
969  comFrCopy.add(aFragments);
970  for (Iterator<AlternativeFragment> aFrIt = cFr.iterator(); aFrIt.hasNext();) {
971  AlternativeFragment fr = aFrIt.next();
972  aFragments.add(fr);
973  fragCount++;
974  }
975  // these two arrays should be initialized here, because their size is
976  // equal to the number of annotations
977  retFragments.add(null);
978  partialFragments.add(null);
979  numsOfSpacesSO.add(null);
980  numsOfSpacesL.add(null);
981  }
982 
983  if (doc == null) {
984  return null;
985  }
986  if (doc.getDocumentElement() == null) {
987  return null;
988  } else { // if document element is presented
989  NodeIterator nIter = ((DocumentTraversal) doc).createNodeIterator(doc.getDocumentElement(),
990  NodeFilter.SHOW_TEXT + NodeFilter.SHOW_CDATA_SECTION, null, true);
991  Integer nStartOffset = 0; // start offset of current node
992  Integer nEndOffset = 0; // end offset of current node
993  String nContent = ""; // content of current node
994  Node currentNode = nIter.nextNode();
995  int nodeCounter = 0;
996  String path = "";
997  while (currentNode != null && fragCount > 0) {
998  // while there is next node and there is something to convert
999  nContent = currentNode.getNodeValue();
1000  int nContentL = nContent.length();
1001  nEndOffset = nStartOffset + nContentL;
1002  path = XPathHelper.XPathStringOfNode(currentNode);
1003 
1004  for (int i = 0; i < numOfAnnots; i++) {
1005  // for all sets of fragments
1006  ArrayList<AlternativeFragment> aFragments = comFrCopy.get(i);
1007  for (Iterator<AlternativeFragment> frIt = aFragments.iterator(); frIt.hasNext();) {
1008  // for all fragments for one annotation
1009  AlternativeFragment fr = frIt.next();
1010  if (fr.getIsGood() == false) {
1011  notConverted.get(i).add(fr); // fragment can not be converted
1012  frIt.remove(); // fragment was processed
1013  fragCount--;
1014  continue;
1015  }
1016  if (fr.getPath().equals(path)) { // if fragment is in the node
1017  AlternativeFragment partFrag = partialFragments.get(i);
1018  int frEndOffset;
1019  if (fr.getOffset() != null) {
1020  frEndOffset = fr.getOffset() + fr.getLength();
1021  }
1022  else {
1023  frEndOffset = fr.getLength();
1024  }
1025  if (frEndOffset > nContentL) { // content of fragment is too long
1026  notConverted.get(i).add(fr);
1027  frIt.remove();
1028  fragCount--;
1029  continue;
1030  } else if (fr.getAnnotatedText() != null && !nContent.substring(fr.getOffset(), frEndOffset).equals(fr.getAnnotatedText())) {
1031  // fr.getAnnotatedText() returns null when linearizing suggestion fragments
1032  // content is not matching
1033  notConverted.get(i).add(fr);
1034  frIt.remove();
1035  fragCount--;
1036  continue;
1037  }
1038  if (partFrag == null && frEndOffset < nContentL) {
1039  // whole linearized fragment in node
1040  if (fr.getOffset() == null) {
1041  // When user requests fragment from the start of a node editor sends offset as null, which causes NullPointerException
1042  fr.setOffset(0);
1043  }
1044  numsOfSpacesSO.set(i,nodeCounter); // set number of spaces before start offset for this fragment
1045  numsOfSpacesL.set(i,0); // set number of spaces inside length for this fragment
1046  AlternativeFragment linFragment = new AlternativeFragment("", nStartOffset + fr.getOffset(),
1047  fr.getLength(), fr.getAnnotatedText(),
1048  fr.getRefAlternative());
1049  frIt.remove(); // fragment was successfully processed
1050  fragCount--;
1051  retFragments.set(i, linFragment); // set result
1052  } else if (partFrag == null && frEndOffset == nContentL) {
1053 
1054  // start of linearized fragment in node
1055  if (fr.getOffset() == null) {
1056  fr.setOffset(0);
1057  }
1058  numsOfSpacesSO.set(i,nodeCounter); // set number of spaces before start offset for this fragment
1059  numsOfSpacesL.set(i,0); // set number of spaces inside length for this fragment
1060  AlternativeFragment linFragment = new AlternativeFragment("", nStartOffset + fr.getOffset(),
1061  fr.getLength(), fr.getAnnotatedText(),
1062  fr.getRefAlternative());
1063 
1064  partialFragments.set(i, linFragment);
1065  frIt.remove(); // fragment was successfully processed
1066  fragCount--;
1067  } else if (partFrag != null && frEndOffset < nContentL) {
1068  // end of linearized fragment in node
1069  AlternativeFragment linFragment = partialFragments.get(i);
1070  if (fr.getAnnotatedText() != null) {
1071  linFragment.setAnnotatedText(linFragment.getAnnotatedText() + fr.getAnnotatedText());
1072  }
1073  linFragment.setLength(linFragment.getLength() + fr.getLength());
1074  retFragments.set(i, linFragment);
1075  partialFragments.set(i, null);
1076  if (numsOfSpacesL.get(i) != null) { // add space after previous node
1077  numsOfSpacesL.set(i,numsOfSpacesL.get(i) + 1);
1078  }
1079  frIt.remove();
1080  fragCount--;
1081  } else if (partFrag != null && frEndOffset == nContentL) {
1082  // part of linearized fragment in node
1083  AlternativeFragment linFragment = partialFragments.get(i);
1084  if (fr.getAnnotatedText() != null) {
1085  linFragment.setAnnotatedText(linFragment.getAnnotatedText() + fr.getAnnotatedText());
1086  }
1087  linFragment.setLength(linFragment.getLength() + fr.getLength());
1088  if (numsOfSpacesL.get(i) != null) { // add space after previous node
1089  numsOfSpacesL.set(i,numsOfSpacesL.get(i) + 1);
1090  }
1091  frIt.remove();
1092  fragCount--;
1093  } else {
1094  // error state
1095  notConverted.get(i).add(fr); // fragment can not be converted
1096  frIt.remove(); // fragment was processed
1097  fragCount--;
1098  continue;
1099  }
1100  } // if fragment is in the node
1101  } // for all fragments for one annotation
1102  } // for all sets of fragments
1103 
1104 
1105  nStartOffset += nContent.length(); // start offset of next node
1106  currentNode = nIter.nextNode(); // move to next node of document
1107  while (currentNode != null && (currentNode.getNodeValue() == null || currentNode.getNodeValue().replaceAll("[\\s\\u00A0]+$", "").contentEquals(""))) {
1108  currentNode = nIter.nextNode(); // move to next node of document
1109  }
1110  nodeCounter++;
1111  } // while there is next node and there is something to convert
1112  } // if document element is presented
1113 
1114  for (int i = 0; i < numOfAnnots; i++) {
1115  // for all sets of fragments
1116  ArrayList<AlternativeFragment> aFragments = comFrCopy.get(i);
1117  // move rest of fragments
1118  for (Iterator<AlternativeFragment> aFrIt = aFragments.iterator(); aFrIt.hasNext();) {
1119  AlternativeFragment fr = aFrIt.next();
1120  notConverted.get(i).add(fr);
1121  aFrIt.remove();
1122  fragCount--;
1123  }
1124 
1125  // finish processing of partial fragment
1126  if (partialFragments.get(i) != null) {
1127  retFragments.set(i, partialFragments.get(i)); // set result
1128  }
1129  } // for all sets of fragments
1130 
1131  if (addSpaces) { // if spaces should be added (offsets incremented)
1132  for (int i = 0; i < numOfAnnots; i++) { // for each converted fragment
1133  AlternativeFragment fr = retFragments.get(i);
1134  if (fr == null) { // fragment was not converted successfully
1135  continue;
1136  }
1137  // increment offset
1138  Integer spaces = numsOfSpacesSO.get(i);
1139  if (spaces == null) {
1140  spaces = 0;
1141  }
1142  fr.setOffset(fr.getOffset() + spaces);
1143  // increment length
1144  spaces = numsOfSpacesL.get(i);
1145  if (spaces == null) {
1146  spaces = 0;
1147  }
1148  fr.setLength(fr.getLength() + spaces);
1149  } // for each converted fragment
1150  } // if spaces should be added (offsets incremented)
1151 
1152  return retFragments;
1153  } // fragmentsToLinAltFragments()
1154 
1155 
1156  /**
1157  * Linearizes fragment of document to string
1158  *
1159  * @param docFr Fragment of document to linearize
1160  * @param doc Whole document with nodes
1161  * @return Returns linearized fragment of document in string
1162  */
1163  public static String linearizeDocumentFragment(DocumentFragment docFr, Document doc) {
1164  StringBuilder linFr = new StringBuilder();
1165  NodeList nodeL = docFr.getChildNodes();
1166  int nodeCount = nodeL.getLength();
1167  for (int i = 0; i < nodeCount; i++) { // for each node
1168  Node curRootNode = nodeL.item(i);
1169  NodeIterator nIter = ((DocumentTraversal) doc).createNodeIterator(curRootNode,
1170  NodeFilter.SHOW_TEXT + NodeFilter.SHOW_CDATA_SECTION, null, true);
1171  Node currentNode = nIter.nextNode();
1172  boolean compensateNewline = false;
1173  while (currentNode != null) {
1174  short nodeType = currentNode.getNodeType();
1175  if (currentNode.getNodeValue() == null) {
1176  currentNode = nIter.nextNode(); // move to next node of document
1177  continue;
1178  }
1179  if (compensateNewline) {
1180  linFr.append(" ");
1181  }
1182  linFr.append(currentNode.getNodeValue());
1183  if (!compensateNewline) {
1184  compensateNewline = true;
1185  }
1186  currentNode = nIter.nextNode(); // move to next node of document
1187  }
1188  } // for each node
1189  return linFr.toString();
1190  } // linearizeDocumentFragment()
1191 
1192  /**
1193  * Convert list of text modifications to linearized modifications.
1194  * For more common modifications one linearized modification can be created.
1195  *
  * Behaviour visible in this method:
  * - If the document has no text or CDATA node, every modification is taken
  *   as text added to an empty document: "<br>" is replaced by a space and
  *   simple tags are removed from its new content (the modification object
  *   itself is updated) and the results are placed one after another.
  * - Otherwise the text nodes are walked in document order. A modification
  *   with null or empty path is passed to the result unchanged, a modification
  *   of the whole "/HTML[1]/BODY[1]" without offset and length replaces the
  *   whole linearized text, and a modification without offset replaces the
  *   whole content of the node whose XPath starts with its path.
  * - A modification with offset and length is only checked against the node
  *   length here; no linearized modification is created for it in the
  *   visible code.
  *
  * NOTE(review): two statements are missing from this listing (bodies of the
  * conditions at listing lines 1205 and 1341), so the description above may be
  * incomplete.
  *
1196  * @param comModifications List of common modifications.
1197  * @param doc Document in which modifications are
1198  * @param notConverted Modifications, which was not converted.
  *        NOTE(review): the parameter is reassigned to a new list inside
  *        the method, so the list passed by the caller is never filled.
1199  * @return Returns list of linearized modifications or null if the document
  *         is null or has no document element
  * @throws ParserConfigurationException not thrown by any visible statement;
  *         presumably comes from the statements missing in this listing - TODO confirm
  * @throws SAXException see above - TODO confirm
  * @throws IOException see above - TODO confirm
1200  */
1201  public static ArrayList<TextModification> modificationsToLinMod(ArrayList<TextModification> comModifications,
1202  Document doc, ArrayList<TextModification> notConverted)
1203  throws ParserConfigurationException, SAXException,
1204  IOException {
1205  if (matcherProvider == null) { // if matcher provider not exists yet, create it
 // NOTE(review): listing line 1206 (presumably the creation of the matcher provider) is missing here
1207  }
1208 
1209  // create copy of array from which modifications will be removed and count modifications
 // NOTE(review): nothing is removed from comModCopy in the visible code
1210  int numOfMods = comModifications.size();
1211  ArrayList<TextModification> comModCopy = new ArrayList<TextModification>(numOfMods);
 // notConvertedFlag.get(i) stays non-null while modification i is waiting for conversion
1212  ArrayList<TextModification> notConvertedFlag = new ArrayList<TextModification>(numOfMods);
1213  notConverted = new ArrayList<TextModification>(); // NOTE(review): replaces the reference given by the caller - caller's list stays untouched
1214 
1215  for (int i = 0; i < numOfMods; i++) {
1216  TextModification cM = comModifications.get(i);
1217  comModCopy.add(cM);
1218  notConvertedFlag.add(cM);
1219  }
1220 
1221  ArrayList<TextModification> retModifications = new ArrayList<TextModification>();
1222 
1223  if (doc == null) {
1224  return null;
1225  }
1226  if (doc.getDocumentElement() == null) {
1227  return null;
1228  } else { // if document element is present
 // iterator over text and CDATA nodes of the whole document
1229  NodeIterator nIter = ((DocumentTraversal) doc).createNodeIterator(doc.getDocumentElement(),
1230  NodeFilter.SHOW_TEXT + NodeFilter.SHOW_CDATA_SECTION, null, true);
1231  Integer nStartOffset = 0; // start offset of current node
1232  Integer nEndOffset = 0; // end offset of current node
1233  String nContent = ""; // content of current node
1234  Node currentNode = nIter.nextNode();
1235 
1236  // ============== adding text to the empty document ==============
1237  if (currentNode == null) {
1238  int nodeCounter = 0; // index of current modification (used for the flag array)
1239  int modifStart = 0; // offset at which current modification starts
1240  for (Iterator<TextModification> tmIt = comModCopy.iterator(); tmIt.hasNext();) {
1241  // for all modifications
1242  TextModification tm = tmIt.next();
1243 
1244  // replace the line break tag with a space
 // NOTE(review): getNewContent() is dereferenced without null check here
1245  tm.setNewContent(tm.getNewContent().replace("<br>"," "));
1246  // remove all common HTML tags so the content is linearized
 // (the pattern matches only tags without whitespace, i.e. without attributes)
1247  tm.setNewContent(tm.getNewContent().replaceAll("<[^>\\s]*>",""));
1248  tm.setLength(tm.getNewContent().length());
1249 
 // NOTE(review): third argument is modifStart + length here, other calls
 // of this constructor in this method pass a length - confirm intent
1250  TextModification lTM = new TextModification(null, modifStart,
1251  modifStart + tm.getLength(), tm.getNewContent());
1252  modifStart += tm.getLength();
1253  retModifications.add(lTM);
1254  notConvertedFlag.set(nodeCounter, null); // modification successfully converted
1255  numOfMods--;
1256  nodeCounter += 1;
1257  }
1258  }
1259 
1260  // ============== modification of the non-empty document ==============
1261  else {
1262  String path = "";
1263  int modifCounter; // index to the wasConverted flag array
1264  boolean compensateNewline = false; // NOTE(review): never set to true in the visible code
1265  while (currentNode != null && numOfMods > 0) {
1266  // while there is next node and there is something to convert
1267  if ( compensateNewline ) {
1268  // add a space character to prevent word joining at the end and at the beginning
1269  nContent = currentNode.getNodeValue().concat(" ");
1270  }
1271  else {
1272  nContent = currentNode.getNodeValue();
1273  }
1274  int nContentL = nContent.length();
1275  nEndOffset = nStartOffset + nContentL;
1276  path = XPathHelper.XPathStringOfNode(currentNode);
1277 
 // NOTE(review): modifCounter is reset for each node and is not incremented
 // in the loop below, so only index 0 of the flag array is cleared
1278  modifCounter = 0;
1279  for (Iterator<TextModification> tmIt = comModCopy.iterator(); tmIt.hasNext();) {
1280  // for all modifications
 // NOTE(review): processed modifications stay in comModCopy, so they are
 // visited again for the next node while numOfMods > 0
1281  TextModification tm = tmIt.next();
1282 
1283  if (tm.getPath() == null || tm.getPath().isEmpty()) {
1284  // linearized modification - no linearization needed
1285  retModifications.add(tm);
1286  notConvertedFlag.set(modifCounter, null); // modification successfully converted
1287  numOfMods--;
1288  continue;
1289  }
1290 
1291  if (tm.getOffset() == null && tm.getLength() == null && tm.getPath().equals("/HTML[1]/BODY[1]")) {
1292  // modification of the whole body
1293  // create iterator and initialize auxiliary variables
1294  NodeIterator wholeDocEraseIt = ((DocumentTraversal) doc).createNodeIterator(doc.getDocumentElement(),
1295  NodeFilter.SHOW_TEXT + NodeFilter.SHOW_CDATA_SECTION, null, true);
1296  Node n = wholeDocEraseIt.nextNode();
1297  boolean newlineCompens = false; // true after the first node with a value
1298  int docLength = 0;
1299  String nodeCont = null;
1300 
1301  // determine the length of the old version of the document
 // (lengths of all node values plus one character between two nodes)
1302  while (n != null) {
1303  nodeCont = n.getNodeValue();
1304  if (nodeCont == null) {
1305  n = wholeDocEraseIt.nextNode();
1306  continue;
1307  }
1308  docLength += nodeCont.length();
1309  if (newlineCompens) {
1310  docLength += 1;
1311  }
1312  if (!newlineCompens) {
1313  newlineCompens = true;
1314  }
1315  n = wholeDocEraseIt.nextNode();
1316  }
1317 
1318  String newContent = "";
1319  // if the new content doesn't represent empty text data set,
1320  // remove all HTML tags
1321  if (tm.getNewContent() != null && !tm.getNewContent().equals("<body></body>")) {
1322  newContent = tm.getNewContent();
1323  // replace the line break tag with a space
1324  newContent = newContent.replace("<br>"," ");
1325  // remove all common HTML tags so the content is linearized
1326  newContent = newContent.replaceAll("<[^>\\s]*>","");
1327  }
1328 
1329  // assemble the linearized version of the modification
1330  TextModification lTM = new TextModification(null, 0, docLength, newContent);
1331  retModifications.add(lTM);
1332  notConvertedFlag.set(modifCounter, null); // modification successfully converted
1333  numOfMods--;
1334  continue;
1335  }
1336 
 // prefix match: path of the modification can denote the node or any of its ancestors
1337  if (path.startsWith(tm.getPath())) { // if modification is in the node
1338  if (tm.getOffset() == null) {
1339  // if whole content of the node will be replaced
1340  String linContent = "";
1341  if (tm.getNewContent() != null) { // new content is present (null means whole node content has been removed)
 // NOTE(review): listing line 1342 (presumably the linearization of the new
 // content into linContent) is missing here
1343  }
1344 
1345  // perform the linearization process and assemble the linearized
1346  // modification data structure
1347  TextModification lTM = null;
1348  if (tm.getOffset() == null) {
1349  lTM = new TextModification(null, nStartOffset,
1350  nEndOffset - nStartOffset, linContent);
1351  }
1352  else { // unreachable: the enclosing condition already requires null offset
1353  lTM = new TextModification(null, nStartOffset + tm.getOffset(),
1354  nEndOffset - nStartOffset, linContent);
1355  }
1356 
1357  retModifications.add(lTM);
1358  notConvertedFlag.set(modifCounter, null); // modification successfully converted
1359  numOfMods--;
1360  continue;
1361  }
1362 
1363  // ------ modified fragment is too long -------
1364  if (tm.getOffset() != null && tm.getLength() != null) {
1365  int tmEndOffset = tm.getOffset() + tm.getLength();
1366 
1367  if (tmEndOffset > nContentL) { // modified fragment is too long
1368  notConverted.add(tm);
1369  notConvertedFlag.set(modifCounter, null);; // flag cleared so that it is not added once more at the end
1370  numOfMods--;
1371 
1372  continue;
1373  }
 // modification which fits into the node is not converted in the visible code;
 // its flag stays set, so it ends in the not converted list
1374  }
1375  } // modification is in the node
1376  } // for all modifications
1377  nStartOffset += nContent.length() + 1; // start offset of next node
1378  // +1 for newline compensation
1379  currentNode = nIter.nextNode(); // move to next node of document
1380  } // while there is next node and there is something to convert
1381  } // modification of the non-empty document
1382  } // if document element is presented
1383 
1384  // move rest of the modifications to the not converted list
 // (the list is local to this method since the parameter was reassigned above)
1385  for (Iterator<TextModification> tmIt = notConvertedFlag.iterator(); tmIt.hasNext();) {
1386  TextModification tm = tmIt.next();
1387  if (tm != null) {
1388  notConverted.add(tm);
1389  }
1390  }
1391 
1392  return retModifications;
1393  } // modificationsToLinMod()
1394 
1395 } // public class Linearizer
Utility functions for document linearization.
Definition: Linearizer.java:40
static ArrayList< TextModification > modificationsToLinMod(ArrayList< TextModification > comModifications, Document doc, ArrayList< TextModification > notConverted)
static ArrayList< ArrayList< AlternativeFragment > > linAltFragmentsToFragments(ArrayList< AlternativeFragment > linFragments, Document doc, ArrayList< ArrayList< AlternativeFragment >> badFragments)
static String linearizeDocumentFragment(DocumentFragment docFr, Document doc)
Class providing access to available matchers.
static String linearizeDocument(Document doc)
Class representing modification of annotated document text.
static ArrayList< ArrayList< SuggestionFragment > > linSugFragmentsToFragments(ArrayList< SuggestionFragment > linFragments, Document doc, ArrayList< ArrayList< SuggestionFragment >> badFragments)
static ArrayList< Fragment > fragmentsToLinFragments(ArrayList< ArrayList< Fragment >> comFragments, Document doc, ArrayList< ArrayList< Fragment >> notConverted, boolean addSpaces)
static ArrayList< ArrayList< Fragment > > linFragmentsToFragments(ArrayList< Fragment > linFragments, Document doc, ArrayList< ArrayList< Fragment >> badFragments)
Definition: Linearizer.java:57
Class representing annotated fragment.
Definition: Fragment.java:48
static ArrayList< SuggestionFragment > fragmentsToLinSugFragments(ArrayList< ArrayList< SuggestionFragment >> comFragments, Document doc, ArrayList< ArrayList< SuggestionFragment >> notConverted, boolean addSpaces)
static ArrayList< AlternativeFragment > fragmentsToLinAltFragments(ArrayList< ArrayList< AlternativeFragment >> comFragments, Document doc, ArrayList< ArrayList< AlternativeFragment >> notConverted, boolean addSpaces)