4A Server -  2.0
 All Classes Namespaces Files Functions Variables Enumerator
MatcherProvider.java
Go to the documentation of this file.
1 /*
2  * Project: Server for annotations sharing
3  * Subproject: Position of fragment of text in XML document
4  * Authors: Peter Bartoš, Michael Angelov
5  * Edited by: Ing. Jaroslav Dytrych idytrych@fit.vutbr.cz,
6  * Bc. Lukas Kubik xkubik22@stud.fit.vutbr.cz
7  * File: MatcherProvider.java
8  * Description: Class providing access to available matchers
9  */
10 
11 /**
12  * @file MatcherProvider.java
13  *
14  * @brief Class providing access to available matchers
15  */
16 
17 package cz.vutbr.fit.knot.annotations.fragmentUpdater;
18 
20 import java.io.IOException;
21 import java.io.StringReader;
22 import java.util.ArrayList;
23 import java.util.Collections;
24 import java.util.Iterator;
25 import javax.xml.parsers.ParserConfigurationException;
26 import javax.xml.xpath.XPathExpressionException;
27 import org.apache.html.dom.HTMLDocumentImpl;
28 import org.cyberneko.html.parsers.DOMFragmentParser;
29 import org.cyberneko.html.parsers.DOMParser;
30 import org.w3c.dom.Document;
31 import org.w3c.dom.DocumentFragment;
32 import org.w3c.dom.Node;
33 import org.w3c.dom.NodeList;
34 import org.w3c.dom.html.HTMLDocument;
35 import org.xml.sax.InputSource;
36 import org.xml.sax.SAXException;
37 
38 /**
39  * Class providing access to available matchers
40  *
41  * @brief Class providing access to available matchers
42  * @author Peter Bartoš, Michael Angelov
43  */
44 public class MatcherProvider {
45 
/** Registered matcher wrappers; the add/remove methods of this class call sortMatchers() after modifying it. */
46  public ArrayList<MatcherWrapper> matcherWrappers;
/** Parser feature identifier for namespaces; getDocumentFromString() sets it to false when its setNamespaces flag is true. */
47  protected static final String _SET_NAMESPACES = "http://xml.org/sax/features/namespaces";
/** Parser feature identifier for error reporting; getDocumentFromString() sets it to false when its setErrors flag is true. */
48  protected static final String _REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
49 
/**
 * Creates a provider that starts with an empty list of matcher wrappers.
 */
public MatcherProvider() {
  this.matcherWrappers = new ArrayList<MatcherWrapper>();
}
57 
58  /**
59  * Method for adding matcher to list of provided matchers
60  *
61  * @param matcher matcher to be added
62  * @param priority priority of matcher
63  */
64  public void add(Matcher matcher, int priority) {
65  matcherWrappers.add(new MatcherWrapper(matcher, priority));
66  sortMatchers();
67  }
68 
69  /**
70  * Method for adding matcher wrapper
71  *
72  * @param mw matcher wrapper to be added
73  */
74  public void add(MatcherWrapper mw) {
75  matcherWrappers.add(mw);
76  sortMatchers();
77  }
78 
79  /**
80  * Method for removing matcher wrapper from list
81  *
82  * @param mw matcher to be removed
83  */
84  public void remove(MatcherWrapper mw) {
85  matcherWrappers.remove(mw);
86  sortMatchers();
87  }
88 
89  /**
90  * Method for removing matcher from provided matcher list
91  *
92  * @param matcher matcher to be removed
93  */
94  public void removeMatcher(Matcher matcher) {
95  for (MatcherWrapper mw : matcherWrappers) {
96  if (mw.matcher == matcher) {
97  matcherWrappers.remove(mw);
98  break;
99  }
100  }
101 
102  // sort matcher in the end
103  sortMatchers();
104  }
105 
106  /**
107  * Getter for list of provided matchers
108  *
109  * @return list of provided matchers
110  */
111  public ArrayList<MatcherWrapper> getMatcherWrappers() {
112  return matcherWrappers;
113  }
114 
115  /**
116  * Method for matching query fragment in document
117  *
118  * @param text text in which fragment should be matched
119  * @param queryFragment fragment to be matched
120  * @return result fragment that was matched in document
121  */
122  public UpdatableFragment match(String text, UpdatableFragment queryFragment) throws ParserConfigurationException, SAXException, IOException, XPathExpressionException {
123  Document document = getDocumentFromString(text, false, true);
124  for (MatcherWrapper mw : matcherWrappers) {
125  UpdatableFragment resultFragment = mw.matcher.match(document, queryFragment);
126  if (resultFragment != null) {
127  return resultFragment;
128  }
129  }
130 
131  return null;
132  }
133 
134  /**
135  * Method for matching query fragment in document
136  *
137  * @param document Document in which fragment should be matched
138  * @param queryFragment fragment to be matched
139  * @return result fragment that was matched in document
140  */
141  public UpdatableFragment match(Document document, UpdatableFragment queryFragment) throws XPathExpressionException {
142  for (MatcherWrapper mw : matcherWrappers) {
143  UpdatableFragment resultFragment = mw.matcher.match(document, queryFragment);
144  if (resultFragment != null) {
145  return resultFragment;
146  }
147  }
148 
149  return null;
150  }
151 
152  /**
153  * Method for matching query fragment in document
154  *
155  * @param document Document in which fragment should be matched
156  * @param queryFragment fragment to be matched
157  * @return result fragments that was matched in document
158  */
159  public ArrayList<UpdatableFragment> matchAll(Document document, UpdatableFragment queryFragment) throws XPathExpressionException {
160  ArrayList<UpdatableFragment> allResultFragments = new ArrayList<UpdatableFragment>();
161  for (MatcherWrapper mw : matcherWrappers) { // for all wrappers
162  ArrayList<UpdatableFragment> resultFragments = mw.matcher.matchAll(document, queryFragment);
163  // Add all fragments which are not included
164  for (Iterator<UpdatableFragment> rFIt = resultFragments.iterator(); rFIt.hasNext();) {
165  UpdatableFragment rF = rFIt.next();
166  if (!allResultFragments.contains(rF)) {
167  allResultFragments.add(rF);
168  }
169  }
170  } // for all wrappers
171 
172  return allResultFragments;
173  } // matchAll()
174 
175  /**
176  * Method for matching query fragment in document
177  *
178  * @param document Document in which fragment should be matched
179  * @param queryFragment fragment to be matched
180  * @return result fragments that was matched in document
181  */
182  public ArrayList<UpdatableFragment> matchAllIncrementally(Document document, UpdatableFragment queryFragment) throws XPathExpressionException {
183  ArrayList<UpdatableFragment> allResultFragments = new ArrayList<UpdatableFragment>();
184  for (MatcherWrapper mw : matcherWrappers) { // for all wrappers
185  ArrayList<UpdatableFragment> resultFragments = mw.matcher.matchAll(document, queryFragment);
186 
187  // Are there any matches?
188  if(!resultFragments.isEmpty()){
189  // Add all fragments within one sensitivity
190  for (Iterator<UpdatableFragment> rFIt = resultFragments.iterator(); rFIt.hasNext();) {
191  UpdatableFragment rF = rFIt.next();
192  allResultFragments.add(rF);
193  }
194  // return results within one sensitivity
195  return allResultFragments;
196  } // isEmpty()
197 
198  } // for all wrappers, increasing sensitivity
199 
200  return allResultFragments;
201  } // matchAllIncrementally()
202 
203  /**
204  * Method for matching query fragment in document
205  * This method should be used only with bidirectional iterator!
206  * Bidirectional iterator has a it's own mechanism for traversing the nodes.
207  *
208  * @param document Document in which fragment should be matched
209  * @param queryFragment fragment to be matched
210  * @return result fragments that was matched in document
211  */
212  public UpdatableFragment matchInClosestNode(Document document, UpdatableFragment queryFragment) throws XPathExpressionException {
213  UpdatableFragment resultFragment = null;
214  boolean endIteration = false;
215 
216  for (MatcherWrapper mw : matcherWrappers) { // initialize all wrappers and interators!
217  mw.matcher.InitIterator(document, queryFragment);
218  }
219 
220  while(!endIteration){
221  endIteration = true;
222 
223  for (MatcherWrapper mw : matcherWrappers) { // for all wrappers
224  // If there is another node for match
225  if(mw.matcher.canMatchNext()){
226  // Matching in next node (traversing method is encapsulated withing the iterator)
227  resultFragment = mw.matcher.matchInNextNode(queryFragment);
228 
229  // Are there any matches?
230  if(resultFragment != null){
231  // return results within one sensitivity
232  return resultFragment;
233  }
234  endIteration = false;
235  }
236  } // for all wrappers
237  }
238 
239  return resultFragment;
240  } // matchInClosestNode()
241 
242  /**
243  * Method for matching query fragment in document
244  * This method should be used only with bidirectional iterator!
245  * Bidirectional iterator has a it's own mechanism for traversing the nodes.
246  *
247  * @param document Document in which fragment should be matched
248  * @param queryFragment fragment to be matched
249  * @return result fragments that was matched in document
250  */
251  public ArrayList<UpdatableFragment> matchIn3ClosestNodes(Document document, UpdatableFragment queryFragment) throws XPathExpressionException {
252  ArrayList<UpdatableFragment> resultFragments = new ArrayList<UpdatableFragment>();
253 
254  for (MatcherWrapper mw : matcherWrappers) { // initialize all wrappers and interators!
255  mw.matcher.InitIterator(document, queryFragment);
256  }
257 
258  // If we don't start on the first node then we have to iterate through 3 nodes
259  int itrerateNodes = 3;
260 
261  Node firstNode = document.getDocumentElement();
262  firstNode = getFirstTextNode(firstNode);
263 
264  NodeList nodeList = XPathHelper.evaluateXPath(document, queryFragment.getXPathExpression());
265 
266  //If the first node is the same as the original fragment node then we iterate through 2 nodes
267  if(nodeList != null && nodeList.getLength() > 0 && nodeList.item(0).isEqualNode(firstNode)){
268  itrerateNodes = 2;
269  }
270 
271  // for all wrappers try to find match in original node and it's siblings
272  for (MatcherWrapper mw : matcherWrappers) {
273  // Match in closest siblings
274  for(int i = 0; i < itrerateNodes; i++){
275  if(mw.matcher.canMatchNext()){
276  // Matching in next node (traversing method is encapsulated withing the iterator)
277  UpdatableFragment uf = mw.matcher.matchInNextNode(queryFragment);
278  // Are there any matches?
279  if(uf != null){
280  resultFragments.add(uf);
281  }
282  }
283  else{
284  break;
285  }
286  }
287  } // for all wrappers
288 
289  // If we didn't match in closest siblings on all sensitivities then we will try
290  // to continue in next siblings.
291  // Code after condition is more or less code of the matchInClosestNode method.
292  if(resultFragments.isEmpty()){
293  UpdatableFragment resultFragment = null;
294  boolean endIteration = false;
295 
296  while(!endIteration && resultFragments.isEmpty()){
297  endIteration = true;
298 
299  for (MatcherWrapper mw : matcherWrappers) { // for all wrappers
300  // If there is another node for match
301  if(mw.matcher.canMatchNext()){
302  // Matching in next node (traversing method is encapsulated withing the iterator)
303  resultFragment = mw.matcher.matchInNextNode(queryFragment);
304 
305  // Are there any matches?
306  if(resultFragment != null){
307  // return results within one sensitivity
308  resultFragments.add(resultFragment);
309  }
310  endIteration = false;
311  }
312  } // for all wrappers
313  }
314  }
315 
316  if(!resultFragments.isEmpty()){
317  UpdatableFragment finalFragment = selectBestSubFragment(queryFragment, resultFragments);
318  resultFragments = new ArrayList<UpdatableFragment>();
319  resultFragments.add(finalFragment);
320  }
321  return resultFragments;
322  } // matchIn3ClosestNodes()
323 
324  /**
325  * Method finds the first text node of a document
326  * @param firstNode Document first node
327  * @return Document first text node
328  */
329  private Node getFirstTextNode(Node firstNode){
330  while (firstNode.hasChildNodes()) {
331  firstNode = firstNode.getFirstChild();
332  }
333  while (firstNode.getNextSibling() != null){
334  firstNode = firstNode.getNextSibling();
335  }
336  while (firstNode.hasChildNodes()) {
337  firstNode = firstNode.getFirstChild();
338  }
339  return firstNode;
340  }
341 
342  /**
343  * Method for sorting matcher according to their priority
344  *
345  */
346  public void sortMatchers() {
347  Collections.sort(matcherWrappers, new java.util.Comparator<MatcherWrapper>(){
348  @Override
349  public int compare(MatcherWrapper o1, MatcherWrapper o2) {
350  return o1.getPriority() - o2.getPriority();
351  }
352  });
353 
354  for (int i = 0; i < matcherWrappers.size(); i++) {
355  matcherWrappers.get(i).setPriority(i);
356  }
357  }
358 
359  /**
360  * Method for removing all matchers
361  *
362  */
363  public void removeAll() {
364  matcherWrappers.clear();
365  }
366 
367  /**
368  * Converts given string into Document
369  *
370  * @param text String to convert
371  * @param setNamespaces Flag for setting namespaces to parser
372  * @param setErrors Flag for setting errors report to parser
373  * @return Document
374  */
375  public Document getDocumentFromString(String text, boolean setNamespaces, boolean setErrors) throws ParserConfigurationException, SAXException, IOException {
376  InputSource is = new InputSource();
377  is.setCharacterStream(new StringReader(text));
378 
379  DOMParser parser = new DOMParser();
380  if (setNamespaces) {
381  parser.setFeature(_SET_NAMESPACES, false);
382  }
383  if (setErrors) {
384  parser.setFeature(_REPORT_ERRORS, false);
385  }
386  parser.parse(is);
387  Document document = parser.getDocument();
388 
389  return document;
390  } // getDocumentFromString()
391 
392  /**
393  * Converts given string into Document fragment
394  *
395  * @param text String to convert
396  * @return Document fragment
397  */
398  public DocumentFragment getFragmentFromString(String text) throws ParserConfigurationException, SAXException, IOException {
399  InputSource is = new InputSource();
400  is.setCharacterStream(new StringReader(text));
401 
402  DOMFragmentParser parser = new DOMFragmentParser();
403  parser.setFeature("http://xml.org/sax/features/namespaces", false);
404  HTMLDocument document = new HTMLDocumentImpl();
405  DocumentFragment fragment = document.createDocumentFragment();
406  parser.parse(is, fragment);
407 
408  return fragment;
409  } // getFragmentFromString()
410 
411  /**
412  * Selects the best fragment from the best substrings
413  *
414  * @param origFragment Original fragment
415  * @param BestSubParts Best substrings
416  * @return New selected fragment
417  * @throws XPathExpressionException
418  */
420  ArrayList<UpdatableFragment> BestSubParts)
421  throws XPathExpressionException{
422  if(BestSubParts.isEmpty()){
423  return null;
424  }
425  else if(BestSubParts.size() == 1){
426  return BestSubParts.get(0);
427  }
428 
429  int minBadness = Integer.MAX_VALUE; //treshold
430 
431  String originalText = origFragment.getText(); // original fragment text
432  int orignalOffset = origFragment.getOffset(); // original fragment offset
433  String originalPath = origFragment.getXPathString(); // original fragment XPath
434 
435  Iterator<UpdatableFragment> itFrg = BestSubParts.iterator(); // best subfragments iterator
436 
437  UpdatableFragment comparedfragment; // currently compared fragment
438 
439  int index = 0; // current index
440  int bestIndex = 0; // index of the best fragment text
441 
442  // finds the best substring
443  while(itFrg.hasNext()){
444 
445  comparedfragment = itFrg.next();
446 
447  int LD = Util.levenshtein(originalText, comparedfragment.getText());
448  double LPerc = (LD / (double)originalText.length()) * 100;
449 
450  int badness = 355 * Util.levenshtein(originalPath, comparedfragment.getXPathString()) +
451  5 * (int)LPerc +
452  Math.abs(orignalOffset - comparedfragment.getOffset());
453  if(badness < minBadness){
454  minBadness = badness;
455  bestIndex = index;
456  }
457  ++index;
458  }
459 
460  return BestSubParts.get(bestIndex);
461  }
462 
463 } // class MatcherProvider
464 
ArrayList< UpdatableFragment > matchAll(Document document, UpdatableFragment queryFragment)
ArrayList< UpdatableFragment > matchAllIncrementally(Document document, UpdatableFragment queryFragment)
UpdatableFragment matchInClosestNode(Document document, UpdatableFragment queryFragment)
Class providing access to available matchers.
ArrayList< UpdatableFragment > matchIn3ClosestNodes(Document document, UpdatableFragment queryFragment)
UpdatableFragment selectBestSubFragment(UpdatableFragment origFragment, ArrayList< UpdatableFragment > BestSubParts)
Class for matcher consisting of comparator and node iterator.
Definition: Matcher.java:32
UpdatableFragment match(String text, UpdatableFragment queryFragment)
UpdatableFragment match(Document document, UpdatableFragment queryFragment)
Utility class (manipulates RFC 3339 dates)
Definition: Util.java:29
Document getDocumentFromString(String text, boolean setNamespaces, boolean setErrors)