Advertisement
rplantiko

XML parsing with vtd-xml

Aug 15th, 2014
660
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 4.76 KB | None | 0 0
  1. /* Using vtd-xml ( http://vtd-xml.sourceforge.net/ ) to parse a huge XML containing all my tweets.
  2.    
  3.    I had collected all my tweets in a Word document, extending it from time to time with new slices
  4.    that I had copied manually from the Twitter app "All My Tweets".
  5.    
  6.    Unfortunately, the links got partially lost during this copy process. Also, the word document grew to an enormous
  7.    size of 2.97 MB / 717 pages of tweets.
  8.    
  9.    So I had the desire to extract all the links and data into a pure text format.
  10.    Second step will be the consolidation of the links: Since the twitter status links for the single message still work,
  11.    I am planning to check and get the correct links with the 'curl' tool, and correct the texts if necessary. Also, the
  12.    current status of the link will be marked (available / n.a.)
  13.    
  14.    Saving the Word document as an XML file gave me an XML doc of 44 MB. So I was looking for the best parser to handle
  15.    an XML document of that size.
  16.    
  17.    I found vtd-xml - a very satisfactory tool. In this first class, I use a combination of both methods:
  18.    - Selecting nodes with XPath expressions - very comfortable :-)
  19.    - Using DOM navigation commands FIRST_CHILD / NEXT_SIBLING to access the desired elements.
  20.    
  21.    I can highly recommend this parser, particularly for scanning huge XML files. The performance is very good.
  22.    
  23.    Future enhancement could be:
  24.    A utility class expecting one or more XPath expressions and the data to be looked at,
  25.    so no new build would be necessary for each new XPath expression.
  26.    If output is in a common format - like JSON, such a tool could be used in a command chain as well.    
  27.    
  28.    */
  29.  
  30. import com.ximpleware.*;
  31. import java.util.*;
  32. import static com.ximpleware.VTDNav.*;
  33.  
  34. public class ParseTweets {
  35.  
  36.   private static VTDGen vg = new VTDGen();
  37.   private static VTDNav vn;
  38.   private static Hashtable<String,String> links = new Hashtable<String,String>();      
  39.  
  40.   public static void main(String[] args) throws ParseException, NavException, XPathParseException, XPathEvalException{
  41.  
  42.     AutoPilot ap = new AutoPilot();
  43.     ap.declareXPathNameSpace("pkg","http://schemas.microsoft.com/office/2006/xmlPackage");
  44.     ap.declareXPathNameSpace("w","http://schemas.openxmlformats.org/wordprocessingml/2006/main");
  45.     ap.declareXPathNameSpace("r","http://schemas.openxmlformats.org/officeDocument/2006/relationships");
  46.     ap.declareXPathNameSpace("a","http://schemas.openxmlformats.org/drawingml/2006/main");
  47.  
  48.     if (vg.parseFile("C:\\Temp\\tweets.xml",true)){
  49.  
  50.         vn = vg.getNav();                
  51.         ap.bind(vn);
  52.        
  53.         ap.selectXPath("/pkg:package/pkg:part[@pkg:name='/word/_rels/document.xml.rels']/pkg:xmlData/Relationships/Relationship");
  54.         while(ap.evalXPath()!=-1){
  55.             int iId = vn.getAttrVal("Id"),
  56.                 iTarget = vn.getAttrVal("Target");
  57.             links.put(vn.toString(iId),vn.toString(iTarget));
  58.         }
  59.        
  60.         ap.selectXPath("/pkg:package/pkg:part[@pkg:name='/word/document.xml']/pkg:xmlData/w:document/w:body//w:p");
  61.         while(ap.evalXPath()!=-1){
  62.           vn.push();
  63.           System.out.println( getSingleTweet( ) );
  64.           vn.pop();
  65.         }
  66.                
  67.     }
  68.   }
  69.  
  70.   private static String getSingleTweet(  ) throws NavException {
  71.     StringBuilder tweet = new StringBuilder("");
  72.     if (vn.toElement(FIRST_CHILD)) {
  73.       do {
  74.          if (vn.matchElement("w:r")) {
  75.            vn.push();
  76.            if (vn.toElement(FIRST_CHILD)) do {
  77.              if (vn.matchElement("w:t")) {
  78.                int k = vn.getText( );
  79.                tweet.append( vn.toString( k ) );
  80.              }
  81.              else if (vn.matchElement("w:drawing")) {
  82.                vn.push();            
  83.                if (vn.toElement(FIRST_CHILD,"wp:inline") &&
  84.                    vn.toElement(FIRST_CHILD,"wp:extent") &&
  85.                    vn.toElement(NEXT_SIBLING,"wp:effectExtent") &&
  86.                    vn.toElement(NEXT_SIBLING,"wp:docPr") &&
  87.                    vn.toElement(FIRST_CHILD,"a:hlinkClick")) {
  88.                  
  89.                  int l = vn.getAttrVal("r:id");  
  90.                  tweet.append( '\n' );
  91.                  tweet.append( links.get( vn.toString(l) ) );
  92.                  
  93.                  }
  94.                vn.pop();
  95.              }
  96.            } while (vn.toElement(NEXT_SIBLING));
  97.            vn.pop();  
  98.          }
  99.          else if (vn.matchElement("w:hyperlink")) {
  100.            int iId = vn.getAttrVal("r:id");
  101.            if (iId > -1) {
  102.              tweet.append(" ");
  103.              tweet.append( links.get( vn.toString( iId ) ) );
  104.              tweet.append(" ");
  105.              }
  106.          }
  107.       } while (vn.toElement(NEXT_SIBLING));
  108.     }
  109.     return tweet.toString();
  110.   }
  111.  
  112. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement