17 package cz.vutbr.fit.knot.annotations.fragmentUpdater;
20 import java.io.IOException;
21 import java.io.StringReader;
22 import java.util.ArrayList;
23 import java.util.Collections;
24 import java.util.Iterator;
25 import javax.xml.parsers.ParserConfigurationException;
26 import javax.xml.xpath.XPathExpressionException;
27 import org.apache.html.dom.HTMLDocumentImpl;
28 import org.cyberneko.html.parsers.DOMFragmentParser;
29 import org.cyberneko.html.parsers.DOMParser;
30 import org.w3c.dom.Document;
31 import org.w3c.dom.DocumentFragment;
32 import org.w3c.dom.Node;
33 import org.w3c.dom.NodeList;
34 import org.w3c.dom.html.HTMLDocument;
35 import org.xml.sax.InputSource;
36 import org.xml.sax.SAXException;
/**
 * SAX feature identifier for namespace processing.
 * The same URI appears as a string literal in getFragmentFromString, where it
 * is passed to parser.setFeature with the value false.
 * NOTE(review): presumably used by getDocumentFromString together with its
 * setNamespaces argument; that call is not visible in this listing - confirm.
 */
47 protected static final String
_SET_NAMESPACES =
"http://xml.org/sax/features/namespaces";
/**
 * Feature identifier in the cyberneko.org HTML parser namespace for
 * error reporting.
 * NOTE(review): presumably applied in getDocumentFromString according to its
 * setErrors argument; the setFeature call is not visible in this listing -
 * confirm.
 */
48 protected static final String
_REPORT_ERRORS =
"http://cyberneko.org/html/features/report-errors";
75 matcherWrappers.add(mw);
85 matcherWrappers.remove(mw);
96 if (mw.matcher == matcher) {
97 matcherWrappers.remove(mw);
126 if (resultFragment != null) {
127 return resultFragment;
144 if (resultFragment != null) {
145 return resultFragment;
160 ArrayList<UpdatableFragment> allResultFragments =
new ArrayList<UpdatableFragment>();
162 ArrayList<UpdatableFragment> resultFragments = mw.matcher.matchAll(document, queryFragment);
164 for (Iterator<UpdatableFragment> rFIt = resultFragments.iterator(); rFIt.hasNext();) {
166 if (!allResultFragments.contains(rF)) {
167 allResultFragments.add(rF);
172 return allResultFragments;
183 ArrayList<UpdatableFragment> allResultFragments =
new ArrayList<UpdatableFragment>();
185 ArrayList<UpdatableFragment> resultFragments = mw.matcher.matchAll(document, queryFragment);
188 if(!resultFragments.isEmpty()){
190 for (Iterator<UpdatableFragment> rFIt = resultFragments.iterator(); rFIt.hasNext();) {
192 allResultFragments.add(rF);
195 return allResultFragments;
200 return allResultFragments;
214 boolean endIteration =
false;
217 mw.matcher.InitIterator(document, queryFragment);
220 while(!endIteration){
225 if(mw.matcher.canMatchNext()){
227 resultFragment = mw.matcher.matchInNextNode(queryFragment);
230 if(resultFragment != null){
232 return resultFragment;
234 endIteration =
false;
239 return resultFragment;
252 ArrayList<UpdatableFragment> resultFragments =
new ArrayList<UpdatableFragment>();
255 mw.matcher.InitIterator(document, queryFragment);
259 int itrerateNodes = 3;
261 Node firstNode = document.getDocumentElement();
264 NodeList nodeList = XPathHelper.evaluateXPath(document, queryFragment.getXPathExpression());
267 if(nodeList != null && nodeList.getLength() > 0 && nodeList.item(0).isEqualNode(firstNode)){
274 for(
int i = 0; i < itrerateNodes; i++){
275 if(mw.matcher.canMatchNext()){
280 resultFragments.add(uf);
292 if(resultFragments.isEmpty()){
294 boolean endIteration =
false;
296 while(!endIteration && resultFragments.isEmpty()){
301 if(mw.matcher.canMatchNext()){
303 resultFragment = mw.matcher.matchInNextNode(queryFragment);
306 if(resultFragment != null){
308 resultFragments.add(resultFragment);
310 endIteration =
false;
316 if(!resultFragments.isEmpty()){
318 resultFragments =
new ArrayList<UpdatableFragment>();
319 resultFragments.add(finalFragment);
321 return resultFragments;
330 while (firstNode.hasChildNodes()) {
331 firstNode = firstNode.getFirstChild();
333 while (firstNode.getNextSibling() != null){
334 firstNode = firstNode.getNextSibling();
336 while (firstNode.hasChildNodes()) {
337 firstNode = firstNode.getFirstChild();
354 for (
int i = 0; i < matcherWrappers.size(); i++) {
355 matcherWrappers.get(i).setPriority(i);
364 matcherWrappers.clear();
375 public Document
getDocumentFromString(String text,
boolean setNamespaces,
boolean setErrors)
throws ParserConfigurationException, SAXException, IOException {
376 InputSource is =
new InputSource();
377 is.setCharacterStream(
new StringReader(text));
379 DOMParser parser =
new DOMParser();
387 Document document = parser.getDocument();
398 public DocumentFragment
getFragmentFromString(String text)
throws ParserConfigurationException, SAXException, IOException {
399 InputSource is =
new InputSource();
400 is.setCharacterStream(
new StringReader(text));
402 DOMFragmentParser parser =
new DOMFragmentParser();
403 parser.setFeature(
"http://xml.org/sax/features/namespaces",
false);
404 HTMLDocument document =
new HTMLDocumentImpl();
405 DocumentFragment fragment = document.createDocumentFragment();
406 parser.parse(is, fragment);
420 ArrayList<UpdatableFragment> BestSubParts)
421 throws XPathExpressionException{
422 if(BestSubParts.isEmpty()){
425 else if(BestSubParts.size() == 1){
426 return BestSubParts.get(0);
429 int minBadness = Integer.MAX_VALUE;
431 String originalText = origFragment.getText();
432 int orignalOffset = origFragment.getOffset();
433 String originalPath = origFragment.getXPathString();
435 Iterator<UpdatableFragment> itFrg = BestSubParts.iterator();
443 while(itFrg.hasNext()){
445 comparedfragment = itFrg.next();
447 int LD = Util.levenshtein(originalText, comparedfragment.getText());
448 double LPerc = (LD / (double)originalText.length()) * 100;
450 int badness = 355 * Util.levenshtein(originalPath, comparedfragment.getXPathString()) +
452 Math.abs(orignalOffset - comparedfragment.getOffset());
453 if(badness < minBadness){
454 minBadness = badness;
460 return BestSubParts.get(bestIndex);
ArrayList< MatcherWrapper > matcherWrappers
ArrayList< UpdatableFragment > matchAll(Document document, UpdatableFragment queryFragment)
ArrayList< UpdatableFragment > matchAllIncrementally(Document document, UpdatableFragment queryFragment)
static final String _SET_NAMESPACES
UpdatableFragment matchInClosestNode(Document document, UpdatableFragment queryFragment)
static final String _REPORT_ERRORS
Class providing access to available matchers.
void add(Matcher matcher, int priority)
DocumentFragment getFragmentFromString(String text)
ArrayList< MatcherWrapper > getMatcherWrappers()
ArrayList< UpdatableFragment > matchIn3ClosestNodes(Document document, UpdatableFragment queryFragment)
void add(MatcherWrapper mw)
Class for XML document fragment.
UpdatableFragment selectBestSubFragment(UpdatableFragment origFragment, ArrayList< UpdatableFragment > BestSubParts)
Class for matcher consisting of comparator and node iterator.
Node getFirstTextNode(Node firstNode)
UpdatableFragment match(String text, UpdatableFragment queryFragment)
Wrapper class for matcher.
UpdatableFragment match(Document document, UpdatableFragment queryFragment)
void removeMatcher(Matcher matcher)
Utility class (manipulates RFC 3339 dates)
Document getDocumentFromString(String text, boolean setNamespaces, boolean setErrors)