diff --git a/doc/motivation.qbk b/doc/motivation.qbk index 08a8c2f..ddfa151 100644 --- a/doc/motivation.qbk +++ b/doc/motivation.qbk @@ -181,6 +181,195 @@ code and local data (['data]) while using asynchronous operations (['async_read()], ['async_write()]). The algorithm is implemented in one function and error handling is done by one try-catch block. +[heading recursive SAX parsing] +To someone who knows SAX, the phrase "recursive SAX parsing" might sound +nonsensical. You get callbacks from SAX; you have to manage the element stack +yourself. If you want recursive XML processing, you must first read the entire +DOM into memory, then walk the tree. + +But coroutines let you invert the flow of control so you can ask for SAX +events. Once you can do that, you can process them recursively. + + // Represent a subset of interesting SAX events + struct BaseEvent{ + BaseEvent(const BaseEvent&)=delete; + BaseEvent& operator=(const BaseEvent&)=delete; + }; + + // End of document or element + struct CloseEvent: public BaseEvent{ + // CloseEvent binds (without copying) the TagType reference. + CloseEvent(const xml::sax::Parser::TagType& name): + mName(name) + {} + + const xml::sax::Parser::TagType& mName; + }; + + // Start of document or element + struct OpenEvent: public CloseEvent{ + // In addition to CloseEvent's TagType, OpenEvent binds AttributeIterator. + OpenEvent(const xml::sax::Parser::TagType& name, + xml::sax::AttributeIterator& attrs): + CloseEvent(name), + mAttrs(attrs) + {} + + xml::sax::AttributeIterator& mAttrs; + }; + + // text within an element + struct TextEvent: public BaseEvent{ + // TextEvent binds the CharIterator. + TextEvent(xml::sax::CharIterator& text): + mText(text) + {} + + xml::sax::CharIterator& mText; + }; + + // The parsing coroutine instantiates BaseEvent subclass instances and + // successively shows them to the main program. It passes a reference so we + // don't slice the BaseEvent subclass. 
+ typedef boost::coroutines::asymmetric_coroutine<const BaseEvent&> coro_t; + + void parser(coro_t::push_type& sink,std::istream& in){ + xml::sax::Parser xparser; + // startDocument() will send OpenEvent + xparser.startDocument([&sink](const xml::sax::Parser::TagType& name, + xml::sax::AttributeIterator& attrs) + { + sink(OpenEvent(name,attrs)); + }); + // startTag() will likewise send OpenEvent + xparser.startTag([&sink](const xml::sax::Parser::TagType& name, + xml::sax::AttributeIterator& attrs) + { + sink(OpenEvent(name,attrs)); + }); + // endTag() will send CloseEvent + xparser.endTag([&sink](const xml::sax::Parser::TagType& name) + { + sink(CloseEvent(name)); + }); + // endDocument() will likewise send CloseEvent + xparser.endDocument([&sink](const xml::sax::Parser::TagType& name) + { + sink(CloseEvent(name)); + }); + // characters() will send TextEvent + xparser.characters([&sink](xml::sax::CharIterator& text) + { + sink(TextEvent(text)); + }); + try + { + // parse the document, firing all the above + xparser.parse(in); + } + catch (xml::Exception e) + { + // xml::sax::Parser throws xml::Exception. Helpfully translate the + // name and provide it as the what() string. + throw std::runtime_error(exception_name(e)); + } + } + + // Recursively traverse the incoming XML document on the fly, pulling + // BaseEvent& references from 'events'. + // 'indent' illustrates the level of recursion. + // Each time we're called, we've just retrieved an OpenEvent from 'events'; + // accept that as a param. + // Return the CloseEvent that ends this element. + const CloseEvent& process(coro_t::pull_type& events,const OpenEvent& context, + const std::string& indent=""){ + // Capture OpenEvent's tag name: as soon as we advance the parser, the + // TagType& reference bound in this OpenEvent will be invalidated. + xml::sax::Parser::TagType tagName = context.mName; + // Since the OpenEvent is still the current value from 'events', pass + // control back to 'events' until the next event. 
Of course, each time we + // come back we must check for the end of the results stream. + while(events()){ + // Another event is pending; retrieve it. + const BaseEvent& event=events.get(); + const OpenEvent* oe; + const CloseEvent* ce; + const TextEvent* te; + if((oe=dynamic_cast<const OpenEvent*>(&event))){ + // When we see OpenEvent, recursively process it. + process(events,*oe,indent+" "); + } + else if((ce=dynamic_cast<const CloseEvent*>(&event))){ + // When we see CloseEvent, validate its tag name and then return + // it. (This assert is really a check on xml::sax::Parser, since + // it already validates matching open/close tags.) + assert(ce->mName == tagName); + return *ce; + } + else if((te=dynamic_cast<const TextEvent*>(&event))){ + // When we see TextEvent, just report its text, along with + // indentation indicating recursion level. + std::cout<<indent<<"text: '"<<te->mText.getText()<<"'\n"; + } + } + } + + // pretend we have an XML file of arbitrary size + std::istringstream in(doc); + try + { + coro_t::pull_type events(std::bind(parser,_1,std::ref(in))); + // We fully expect at least ONE event. + assert(events); + // This dynamic_cast<&> is itself an assertion that the first event is an + // OpenEvent. + const OpenEvent& context=dynamic_cast<const OpenEvent&>(events.get()); + process(events, context); + } + catch (std::exception& e) + { + std::cout << "Parsing error: " << e.what() << '\n'; + } + +This problem does not map at all well to communicating between independent +threads. It makes no sense for either side to proceed independently of the +other. You want them to pass control back and forth. + +The solution involves a small polymorphic class event hierarchy, to which +we're passing references. The actual instances are temporaries on the +coroutine's stack; the coroutine passes each reference in turn to the main +logic. Copying them as base-class values would slice them. 
+ +If we were trying to let the SAX parser proceed independently of the consuming +logic, one could imagine allocating event-subclass instances on the heap, +passing them along on a thread-safe queue of pointers. But that doesn't work +either, because these event classes bind references passed by the SAX parser. +The moment the parser moves on, those references become invalid. + +Instead of binding a ['TagType&] reference, we could store a copy of +the ['TagType] in ['CloseEvent]. But that doesn't solve the whole +problem. For attributes, we get an ['AttributeIterator&]; for text we get +a ['CharIterator&]. Storing a copy of those iterators is pointless: once +the parser moves on, those iterators are invalidated. You must process the +attribute iterator (or character iterator) during the SAX callback for that +event. + +Naturally we could retrieve and store a copy of every attribute and its value; +we could store a copy of every chunk of text. That would effectively be all +the text in the document -- a heavy price to pay, if the reason we're using +SAX is concern about fitting the entire DOM into memory. + +There's yet another advantage to using coroutines. This SAX parser throws an +exception when parsing fails. With a coroutine implementation, you need only +wrap the calling code in try/catch. + +With communicating threads, you would have to arrange to catch the exception +and pass along the exception pointer on the same queue you're using to deliver +the other events. You would then have to rethrow the exception to unwind the +recursive document processing. + +The coroutine solution maps very naturally to the problem space. + [heading 'same fringe' problem]