// XML parsing with vtd-xml

/* Using vtd-xml ( http://vtd-xml.sourceforge.net/ ) to parse a huge XML containing all my tweets.

   I had collected all my tweets in a Word document, adding new batches from time to time
   that I had copied manually from the Twitter app "All My Tweets".

   Unfortunately, the links got partially lost during this copy process. Also, the word document grew to an enormous
   size of 2.97 MB / 717 pages of tweets.

   So I had the desire to extract all the links and data into a pure text format.
   The second step will be the consolidation of the links: since the Twitter status links for the individual messages
   still work, I am planning to check and fetch the correct links with the 'curl' tool, and correct the texts if
   necessary. Also, the current status of each link will be marked (available / n.a.).

   Saving the Word document as an XML file gave me an XML document of 44 MB. So I was looking for the best parser to
   handle an XML document of that size.

   I found vtd-xml - a very satisfactory tool. In this first class, I use a combination of both methods:
   - Selecting nodes with XPath expressions - very comfortable :-)
   - Using DOM navigation commands FIRST_CHILD / NEXT_SIBLING to access the desired elements.

   I can highly recommend this parser, particularly for scanning huge XML files. The performance is very good.

   Future enhancement could be:
   A utility class expecting one or more XPath expressions and the data to be looked at,
   so no new build would be necessary for each new XPath expression.
   If output is in a common format - like JSON, such a tool could be used in a command chain as well.

   */

import com.ximpleware.*;
import java.util.*;
import static com.ximpleware.VTDNav.*;

public class ParseTweets {

  private static VTDGen vg = new VTDGen();
  private static VTDNav vn;
  private static Hashtable<String,String> links = new Hashtable<String,String>();

  public static void main(String[] args) throws ParseException, NavException, XPathParseException, XPathEvalException{

    AutoPilot ap = new AutoPilot();
    ap.declareXPathNameSpace("pkg","http://schemas.microsoft.com/office/2006/xmlPackage");
    ap.declareXPathNameSpace("w","http://schemas.openxmlformats.org/wordprocessingml/2006/main");
    ap.declareXPathNameSpace("r","http://schemas.openxmlformats.org/officeDocument/2006/relationships");
    ap.declareXPathNameSpace("a","http://schemas.openxmlformats.org/drawingml/2006/main");

    if (vg.parseFile("C:\\Temp\\tweets.xml",true)){

        vn = vg.getNav();
        ap.bind(vn);

        ap.selectXPath("/pkg:package/pkg:part[@pkg:name='/word/_rels/document.xml.rels']/pkg:xmlData/Relationships/Relationship");
        while(ap.evalXPath()!=-1){
            int iId = vn.getAttrVal("Id"),
                iTarget = vn.getAttrVal("Target");
            links.put(vn.toString(iId),vn.toString(iTarget));
        }

        ap.selectXPath("/pkg:package/pkg:part[@pkg:name='/word/document.xml']/pkg:xmlData/w:document/w:body//w:p");
        while(ap.evalXPath()!=-1){
          vn.push();
          System.out.println( getSingleTweet( ) );
          vn.pop();
        }

    }
  }

  private static String getSingleTweet(  ) throws NavException {
    StringBuilder tweet = new StringBuilder("");
    if (vn.toElement(FIRST_CHILD)) {
      do {
         if (vn.matchElement("w:r")) {
           vn.push();
           if (vn.toElement(FIRST_CHILD)) do {
             if (vn.matchElement("w:t")) {
               int k = vn.getText( );
               tweet.append( vn.toString( k ) );
             }
             else if (vn.matchElement("w:drawing")) {
               vn.push();
               if (vn.toElement(FIRST_CHILD,"wp:inline") &&
                   vn.toElement(FIRST_CHILD,"wp:extent") &&
                   vn.toElement(NEXT_SIBLING,"wp:effectExtent") &&
                   vn.toElement(NEXT_SIBLING,"wp:docPr") &&
                   vn.toElement(FIRST_CHILD,"a:hlinkClick")) {

                 int l = vn.getAttrVal("r:id");
                 tweet.append( '\n' );
                 tweet.append( links.get( vn.toString(l) ) );

                 }
               vn.pop();
             }
           } while (vn.toElement(NEXT_SIBLING));
           vn.pop();
         }
         else if (vn.matchElement("w:hyperlink")) {
           int iId = vn.getAttrVal("r:id");
           if (iId > -1) {
             tweet.append(" ");
             tweet.append( links.get( vn.toString( iId ) ) );
             tweet.append(" ");
             }
         }
      } while (vn.toElement(NEXT_SIBLING));
    }
    return tweet.toString();
  }

}