Rewritten Xml2XmlParser

It is now a push parser, plus a utility function for parsing files.
pull/25/merge
Tatsuhiro Tsujikawa 2012-07-11 23:20:48 +09:00
parent cd67e27ca4
commit 70685bd233
4 changed files with 136 additions and 75 deletions

View File

@ -2,7 +2,7 @@
/* /*
* aria2 - The high speed download utility * aria2 - The high speed download utility
* *
* Copyright (C) 2011 Tatsuhiro Tsujikawa * Copyright (C) 2012 Tatsuhiro Tsujikawa
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@ -36,28 +36,17 @@
#include <cassert> #include <cassert>
#include <cstring> #include <cstring>
#include <deque>
#include <libxml/parser.h>
#include "a2io.h" #include "a2io.h"
#include "BinaryStream.h"
#include "ParserStateMachine.h" #include "ParserStateMachine.h"
#include "A2STR.h" #include "A2STR.h"
#include "a2functional.h" #include "a2functional.h"
#include "XmlAttr.h" #include "XmlAttr.h"
#include "util.h"
namespace aria2 { namespace aria2 {
namespace { namespace xml {
// Per-parse state handed to the libxml2 SAX callbacks through their
// userData pointer.
struct SessionData {
  // One text buffer per open element that asked for character
  // buffering; the innermost element sits at the front.
  std::deque<std::string> charactersStack_;
  // State machine that receives the element events. This struct never
  // deletes it.
  ParserStateMachine* psm_;

  SessionData(ParserStateMachine* stateMachine) : psm_(stateMachine) {}
};
} // namespace
namespace { namespace {
void mlStartElement void mlStartElement
@ -88,13 +77,13 @@ void mlStartElement
xmlAttr.valueLength = pattrs[i+4]-xmlAttr.value; xmlAttr.valueLength = pattrs[i+4]-xmlAttr.value;
xmlAttrs.push_back(xmlAttr); xmlAttrs.push_back(xmlAttr);
} }
sd->psm_->beginElement sd->psm->beginElement
(reinterpret_cast<const char*>(localname), (reinterpret_cast<const char*>(localname),
reinterpret_cast<const char*>(prefix), reinterpret_cast<const char*>(prefix),
reinterpret_cast<const char*>(nsUri), reinterpret_cast<const char*>(nsUri),
xmlAttrs); xmlAttrs);
if(sd->psm_->needsCharactersBuffering()) { if(sd->psm->needsCharactersBuffering()) {
sd->charactersStack_.push_front(A2STR::NIL); sd->charactersStack.push_front(A2STR::NIL);
} }
} }
} // namespace } // namespace
@ -108,11 +97,11 @@ void mlEndElement
{ {
SessionData* sd = reinterpret_cast<SessionData*>(userData); SessionData* sd = reinterpret_cast<SessionData*>(userData);
std::string characters; std::string characters;
if(sd->psm_->needsCharactersBuffering()) { if(sd->psm->needsCharactersBuffering()) {
characters = sd->charactersStack_.front(); characters = sd->charactersStack.front();
sd->charactersStack_.pop_front(); sd->charactersStack.pop_front();
} }
sd->psm_->endElement sd->psm->endElement
(reinterpret_cast<const char*>(localname), (reinterpret_cast<const char*>(localname),
reinterpret_cast<const char*>(prefix), reinterpret_cast<const char*>(prefix),
reinterpret_cast<const char*>(nsUri), reinterpret_cast<const char*>(nsUri),
@ -124,8 +113,8 @@ namespace {
void mlCharacters(void* userData, const xmlChar* ch, int len) void mlCharacters(void* userData, const xmlChar* ch, int len)
{ {
SessionData* sd = reinterpret_cast<SessionData*>(userData); SessionData* sd = reinterpret_cast<SessionData*>(userData);
if(sd->psm_->needsCharactersBuffering()) { if(sd->psm->needsCharactersBuffering()) {
sd->charactersStack_.front().append(&ch[0], &ch[len]); sd->charactersStack.front().append(&ch[0], &ch[len]);
} }
} }
} // namespace } // namespace
@ -169,61 +158,85 @@ xmlSAXHandler mySAXHandler =
} // namespace } // namespace
XmlParser::XmlParser(ParserStateMachine* psm) XmlParser::XmlParser(ParserStateMachine* psm)
: psm_(psm) : psm_(psm),
sessionData_(psm),
ctx_(xmlCreatePushParserCtxt(&mySAXHandler, &sessionData_, 0, 0, 0)),
lastError_(0)
{} {}
XmlParser::~XmlParser() {} XmlParser::~XmlParser()
bool XmlParser::parseFile(const char* filename)
{ {
SessionData sessionData(psm_); xmlFreeParserCtxt(ctx_);
// Old libxml2(at least 2.7.6, Ubuntu 10.04LTS) does not read stdin
// when "/dev/stdin" is passed as filename while 2.7.7 does. So we
// convert DEV_STDIN to "-" for compatibility.
const char* nfilename;
if(strcmp(filename, DEV_STDIN) == 0) {
nfilename = "-";
} else {
nfilename = filename;
}
int r = xmlSAXUserParseFile(&mySAXHandler, &sessionData, nfilename);
return r == 0 && psm_->finished();
} }
bool XmlParser::parseBinaryStream(BinaryStream* bs) ssize_t XmlParser::parseUpdate(const char* data, size_t size)
{ {
const size_t bufSize = 4096; if(lastError_ != 0) {
unsigned char buf[bufSize]; return lastError_;
ssize_t res = bs->readData(buf, 4, 0);
if(res != 4) {
return false;
} }
SessionData sessionData(psm_); int rv = xmlParseChunk(ctx_, data, size, 0);
xmlParserCtxtPtr ctx = xmlCreatePushParserCtxt if(rv != 0) {
(&mySAXHandler, &sessionData, return lastError_ = ERR_XML_PARSE;
reinterpret_cast<const char*>(buf), res, 0); } else {
auto_delete<xmlParserCtxtPtr> deleter(ctx, xmlFreeParserCtxt); return size;
off_t readOffset = res; }
while(1) { }
ssize_t res = bs->readData(buf, bufSize, readOffset);
if(res == 0) { ssize_t XmlParser::parseFinal(const char* data, size_t size)
break; {
} if(lastError_ != 0) {
if(xmlParseChunk(ctx, reinterpret_cast<const char*>(buf), res, 0) != 0) { return lastError_;
// TODO we need this? Just break is not suffice? }
int rv = xmlParseChunk(ctx_, data, size, 1);
if(rv != 0) {
return lastError_ = ERR_XML_PARSE;
} else {
return size;
}
}
int XmlParser::reset()
{
// TODO psm must be reset
sessionData_.reset();
int rv = xmlCtxtResetPush(ctx_, 0, 0, 0, 0);
if(rv != 0) {
return lastError_ = ERR_RESET;
} else {
return 0;
}
}
bool parseFile(const std::string& filename, ParserStateMachine* psm)
{
int fd;
if(filename == DEV_STDIN) {
fd = STDIN_FILENO;
} else {
while((fd = a2open(utf8ToWChar(filename).c_str(),
O_BINARY | O_RDONLY, OPEN_MODE)) == -1 && fd != EINTR);
if(fd == -1) {
return false; return false;
} }
readOffset += res;
} }
xmlParseChunk(ctx, reinterpret_cast<const char*>(buf), 0, 1); XmlParser ps(psm);
return psm_->finished(); char buf[4096];
ssize_t nread;
bool retval = true;
while((nread = read(fd, buf, sizeof(buf))) > 0) {
if(ps.parseUpdate(buf, nread) < 0) {
retval = false;
break;
}
}
if(nread == 0 && retval) {
if(ps.parseFinal(0, 0) < 0) {
retval = false;
}
}
return retval;
} }
bool XmlParser::parseMemory(const char* xml, size_t len) } // namespace xml
{
SessionData sessionData(psm_);
int r = xmlSAXUserParseMemory(&mySAXHandler, &sessionData, xml, len);
return r == 0 && psm_->finished();
}
} // namespace aria2 } // namespace aria2

View File

@ -2,7 +2,7 @@
/* /*
* aria2 - The high speed download utility * aria2 - The high speed download utility
* *
* Copyright (C) 2011 Tatsuhiro Tsujikawa * Copyright (C) 2012 Tatsuhiro Tsujikawa
* *
* This program is free software; you can redistribute it and/or modify * This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@ -37,25 +37,56 @@
#include "common.h" #include "common.h"
#include <sys/types.h>
#include <cstdlib> #include <cstdlib>
#include <string>
#include <deque>
#include <libxml/parser.h>
namespace aria2 { namespace aria2 {
class BinaryStream;
class ParserStateMachine; class ParserStateMachine;
namespace xml {
// Negative status codes reported by XmlParser.
enum XmlError {
  // xmlParseChunk() reported a failure in parseUpdate()/parseFinal().
  ERR_XML_PARSE = -1,
  // xmlCtxtResetPush() reported a failure in reset().
  ERR_RESET = -2
};
// State shared between XmlParser and the libxml2 SAX callbacks, which
// receive it through their userData pointer.
struct SessionData {
  // One text buffer per open element that requested character
  // buffering; the innermost element is at the front.
  std::deque<std::string> charactersStack;
  // State machine that receives the element events; this struct never
  // deletes it.
  ParserStateMachine* psm;

  // explicit: a bare ParserStateMachine* must not silently convert
  // into a SessionData. The parameter is named differently from the
  // member so the initializer does not rely on name shadowing.
  explicit SessionData(ParserStateMachine* stateMachine)
    : psm(stateMachine)
  {}

  // Drops all buffered character data. The state machine pointer is
  // left untouched.
  void reset()
  {
    charactersStack.clear();
  }
};
class XmlParser { class XmlParser {
public: public:
// This object does not delete psm. // This object does not delete psm.
XmlParser(ParserStateMachine* psm); XmlParser(ParserStateMachine* psm);
~XmlParser(); ~XmlParser();
bool parseFile(const char* filename); ssize_t parseUpdate(const char* data, size_t size);
bool parseBinaryStream(BinaryStream* binaryStream); ssize_t parseFinal(const char* data, size_t size);
bool parseMemory(const char* xml, size_t size); int reset();
private: private:
ParserStateMachine* psm_; ParserStateMachine* psm_;
SessionData sessionData_;
xmlParserCtxtPtr ctx_;
int lastError_;
}; };
bool parseFile(const std::string& filename, ParserStateMachine* psm);
} // namespace xml
} // namespace aria2 } // namespace aria2
#endif // D_XML2_XML_PARSER_H #endif // D_XML2_XML_PARSER_H

View File

@ -125,7 +125,7 @@ SharedHandle<Metalinker> parseFile
{ {
MetalinkParserStateMachine psm; MetalinkParserStateMachine psm;
psm.setBaseUri(baseUri); psm.setBaseUri(baseUri);
if(!XmlParser(&psm).parseFile(filename.c_str())) { if(!xml::parseFile(filename, &psm)) {
throw DL_ABORT_EX2("Could not parse Metalink XML document.", throw DL_ABORT_EX2("Could not parse Metalink XML document.",
error_code::METALINK_PARSE_ERROR); error_code::METALINK_PARSE_ERROR);
} }
@ -142,7 +142,24 @@ SharedHandle<Metalinker> parseBinaryStream
{ {
MetalinkParserStateMachine psm; MetalinkParserStateMachine psm;
psm.setBaseUri(baseUri); psm.setBaseUri(baseUri);
if(!XmlParser(&psm).parseBinaryStream(bs)) { xml::XmlParser ps(&psm);
unsigned char buf[4096];
ssize_t nread;
off_t offread = 0;
bool retval = true;
while((nread = bs->readData(buf, sizeof(buf), offread)) > 0) {
if(ps.parseUpdate(reinterpret_cast<const char*>(buf), nread) < 0) {
retval = false;
break;
}
offread += nread;
}
if(nread == 0 && retval) {
if(ps.parseFinal(0, 0) < 0) {
retval = false;
}
}
if(!retval) {
throw DL_ABORT_EX2("Could not parse Metalink XML document.", throw DL_ABORT_EX2("Could not parse Metalink XML document.",
error_code::METALINK_PARSE_ERROR); error_code::METALINK_PARSE_ERROR);
} }

View File

@ -53,7 +53,7 @@ namespace rpc {
RpcRequest xmlParseMemory(const char* xml, size_t size) RpcRequest xmlParseMemory(const char* xml, size_t size)
{ {
XmlRpcRequestParserStateMachine psm; XmlRpcRequestParserStateMachine psm;
if(!XmlParser(&psm).parseMemory(xml, size)) { if(xml::XmlParser(&psm).parseFinal(xml, size) < 0) {
throw DL_ABORT_EX(MSG_CANNOT_PARSE_XML_RPC_REQUEST); throw DL_ABORT_EX(MSG_CANNOT_PARSE_XML_RPC_REQUEST);
} }
SharedHandle<List> params; SharedHandle<List> params;