I have an XML structure like this:
<?xml version="1.0" encoding="utf-8"?>
<videos>
<video>
<id>47288</id>
<thumbs>
<thumb><![CDATA[http://foo.com/bar.jpg]]></thumb>
</thumbs>
<link><![CDATA[http://foo.com/bar.html]]></link>
<title><![CDATA[Sample Title Here]]></title>
<categories>
<category><![CDATA[Cat1]]></category>
<category><![CDATA[Cat2]]></category>
</categories>
<tags>
<tag><![CDATA[Tag1]]></tag>
<tag><![CDATA[Tag2]]></tag>
<tag><![CDATA[Tag3]]></tag>
<tag><![CDATA[Tag4]]></tag>
<tag><![CDATA[Tag5]]></tag>
<tag><![CDATA[Tag6]]></tag>
</tags>
<duration><![CDATA[9:57]]></duration>
<pubDate><![CDATA[2013-12-17]]></pubDate>
</video>
// insert 200,000 more <video> entries here
I have no idea why everything is wrapped in CDATA, but there's not much I can do about it; it's the data I've been given. To read this massive (1.5 GB) XML file, I stream it with fs into sax and then into saxpath, like so:
// Streams a large XML file through sax and saxpath, converting every
// element matched by '/videos/video' into a JavaScript object with xml2js
// and printing it. The file is piped in chunks rather than read whole.
var saxpath = require('saxpath')
var fs = require('fs')
var sax = require('sax')
var parseString = require('xml2js').parseString;
var util = require('util');

// `true` puts the sax stream parser in strict mode.
var saxParser = sax.createStream(true)

// saxpath listens to the sax stream and emits 'match' with the XML text of
// each element found at the given path.
// NOTE(review): the fragment logged below shows elements whose CDATA content
// is missing; presumably the fragment is rebuilt from sax events without the
// CDATA sections -- confirm against the installed saxpath/sax versions.
var streamer = new saxpath.SaXPath(saxParser, '/videos/video')

streamer.on('match', function(xml) {
    console.log(xml);
    parseString(xml, function (err, result) {
        // Report a fragment that fails to parse instead of falling through:
        // `result` is undefined in that case, and the JSON round-trip below
        // would throw an unrelated SyntaxError that hides the real cause.
        if (err) {
            console.error('Failed to parse matched <video> fragment:', err);
            return;
        }
        // Round-trip through JSON so the inspected value is a plain-data
        // deep copy of the xml2js result.
        var json1 = JSON.stringify(result);
        var json = JSON.parse(json1);
        // depth `null` makes util.inspect print every nesting level.
        console.log(util.inspect(json, false, null));
    });
});

// Start the pipeline: file chunks -> sax parser -> saxpath matches.
fs.createReadStream('./xml/big_data_file.xml').pipe(saxParser)
However, the console.log(xml) call prints this:
<video>
<id>620339</id>
<thumbs>
<thumb></thumb>
</thumbs>
<link></link>
<title></title>
<categories>
<category></category>
<category></category>
</categories>
<tags>
<tag></tag>
<tag></tag>
<tag></tag>
<tag></tag>
<tag></tag>
<tag></tag>
<tag></tag>
</tags>
<duration></duration>
<pubDate></pubDate>
</video>
There is no data inside the elements whatsoever. There's no mention of CDATA in the saxpath docs, and I'm not sure whether this is an issue with saxpath or with sax itself.
Any ideas how I can remedy this?
Cheers!