mirror of
https://github.com/boostorg/coroutine.git
synced 2026-01-19 16:12:18 +00:00
add recursive SAX parsing to documentation
This commit is contained in:
@@ -181,6 +181,195 @@ code and local data (['data]) while using asynchronous operations
|
||||
(['async_read()], ['async_write()]). The algorithm is implemented in one
|
||||
function and error handling is done by one try-catch block.
|
||||
|
||||
[heading recursive SAX parsing]
|
||||
To someone who knows SAX, the phrase "recursive SAX parsing" might sound
|
||||
nonsensical. You get callbacks from SAX; you have to manage the element stack
|
||||
yourself. If you want recursive XML processing, you must first read the entire
|
||||
document into a DOM in memory, then walk the tree.
|
||||
|
||||
But coroutines let you invert the flow of control so you can ask for SAX
|
||||
events. Once you can do that, you can process them recursively.
|
||||
|
||||
// Represent a subset of interesting SAX events
|
||||
struct BaseEvent{
    // Root of the event hierarchy. Events are only ever handed around by
    // reference, so copying is disabled to rule out accidental slicing.

    // The deleted copy constructor below is a user-declared constructor,
    // which suppresses the implicit default constructor. Request it
    // explicitly, otherwise no derived event could be constructed.
    BaseEvent()=default;

    // A virtual member makes the hierarchy polymorphic; dynamic_cast from a
    // BaseEvent reference or pointer to a derived event requires that.
    virtual ~BaseEvent()=default;

    BaseEvent(const BaseEvent&)=delete;
    BaseEvent& operator=(const BaseEvent&)=delete;
};
|
||||
|
||||
// End of document or element
|
||||
struct CloseEvent: public BaseEvent{
|
||||
// CloseEvent binds (without copying) the TagType reference.
|
||||
CloseEvent(const xml::sax::Parser::TagType& name):
|
||||
mName(name)
|
||||
{}
|
||||
|
||||
const xml::sax::Parser::TagType& mName;
|
||||
};
|
||||
|
||||
// Start of document or element
|
||||
struct OpenEvent: public CloseEvent{
|
||||
// In addition to CloseEvent's TagType, OpenEvent binds AttributeIterator.
|
||||
OpenEvent(const xml::sax::Parser::TagType& name,
|
||||
xml::sax::AttributeIterator& attrs):
|
||||
CloseEvent(name),
|
||||
mAttrs(attrs)
|
||||
{}
|
||||
|
||||
xml::sax::AttributeIterator& mAttrs;
|
||||
};
|
||||
|
||||
// text within an element
|
||||
struct TextEvent: public BaseEvent{
|
||||
// TextEvent binds the CharIterator.
|
||||
TextEvent(xml::sax::CharIterator& text):
|
||||
mText(text)
|
||||
{}
|
||||
|
||||
xml::sax::CharIterator& mText;
|
||||
};
|
||||
|
||||
// The parsing coroutine instantiates BaseEvent subclass instances and
|
||||
// successively shows them to the main program. It passes a reference so we
|
||||
// don't slice the BaseEvent subclass.
|
||||
// Coroutine type whose transferred value is a const BaseEvent reference.
using coro_t = boost::coroutines::asymmetric_coroutine<const BaseEvent&>;
|
||||
|
||||
// Coroutine body: run the SAX parser over 'in' and push one event object
// into 'sink' for every callback the parser fires.
//   sink - push side of the coroutine; each event is passed to it by reference
//   in   - stream holding the XML document to parse
// Throws std::runtime_error (what() == exception_name(e)) when the parser
// throws xml::Exception.
void parser(coro_t::push_type& sink,std::istream& in){
    xml::sax::Parser xparser;

    // Each callback below wraps the parser's arguments in a temporary event
    // and hands it to 'sink'. The lambdas capture 'sink' by reference.

    // startDocument() will send OpenEvent
    xparser.startDocument([&sink](const xml::sax::Parser::TagType& name,
                                  xml::sax::AttributeIterator& attrs)
                          {
                              sink(OpenEvent(name,attrs));
                          });
    // startTag() will likewise send OpenEvent
    xparser.startTag([&sink](const xml::sax::Parser::TagType& name,
                             xml::sax::AttributeIterator& attrs)
                     {
                         sink(OpenEvent(name,attrs));
                     });
    // endTag() will send CloseEvent
    xparser.endTag([&sink](const xml::sax::Parser::TagType& name)
                   {
                       sink(CloseEvent(name));
                   });
    // endDocument() will likewise send CloseEvent
    xparser.endDocument([&sink](const xml::sax::Parser::TagType& name)
                        {
                            sink(CloseEvent(name));
                        });
    // characters() will send TextEvent
    xparser.characters([&sink](xml::sax::CharIterator& text)
                       {
                           sink(TextEvent(text));
                       });
    try
    {
        // parse the document, firing all the above
        xparser.parse(in);
    }
    // Catch by reference: catching by value would copy the exception and
    // slice off any derived type before exception_name() inspects it.
    catch (xml::Exception& e)
    {
        // xml::sax::Parser throws xml::Exception. Helpfully translate the
        // name and provide it as the what() string.
        throw std::runtime_error(exception_name(e));
    }
}
|
||||
|
||||
// Recursively traverse the incoming XML document on the fly, pulling
|
||||
// BaseEvent& references from 'events'.
|
||||
// 'indent' illustrates the level of recursion.
|
||||
// Each time we're called, we've just retrieved an OpenEvent from 'events';
|
||||
// accept that as a param.
|
||||
// Return the CloseEvent that ends this element.
|
||||
// Throws std::runtime_error if 'events' is exhausted before the CloseEvent
// that ends this element has been seen.
const CloseEvent& process(coro_t::pull_type& events,const OpenEvent& context,
                          const std::string& indent=""){
    // Capture OpenEvent's tag name: as soon as we advance the parser, the
    // TagType& reference bound in this OpenEvent will be invalidated.
    xml::sax::Parser::TagType tagName = context.mName;
    // Since the OpenEvent is still the current value from 'events', pass
    // control back to 'events' until the next event. Of course, each time we
    // come back we must check for the end of the results stream.
    while(events()){
        // Another event is pending; retrieve it.
        const BaseEvent& event=events.get();
        // OpenEvent derives from CloseEvent, so it has to be tested first;
        // otherwise every OpenEvent would match the CloseEvent branch.
        if(const OpenEvent* oe=dynamic_cast<const OpenEvent*>(&event)){
            // When we see OpenEvent, recursively process it. The nested
            // element's CloseEvent is returned but not needed here.
            process(events,*oe,indent+" ");
        }
        else if(const CloseEvent* ce=dynamic_cast<const CloseEvent*>(&event)){
            // When we see CloseEvent, validate its tag name and then return
            // it. (This assert is really a check on xml::sax::Parser, since
            // it already validates matching open/close tags.)
            assert(ce->mName == tagName);
            return *ce;
        }
        else if(const TextEvent* te=dynamic_cast<const TextEvent*>(&event)){
            // When we see TextEvent, just report its text, along with
            // indentation indicating recursion level.
            std::cout<<indent<<"text: '"<<te->mText.getText()<<"'\n";
        }
    }
    // The event stream ended without the CloseEvent for this element.
    // Falling off the end of a function that returns a reference is
    // undefined behaviour, so report the condition instead.
    throw std::runtime_error("event stream ended before the element was closed");
}
|
||||
|
||||
// pretend we have an XML file of arbitrary size
|
||||
std::istringstream in(doc);
|
||||
try
|
||||
{
|
||||
coro_t::pull_type events(std::bind(parser,_1,std::ref(in)));
|
||||
// We fully expect at least ONE event.
|
||||
assert(events);
|
||||
// This dynamic_cast<&> is itself an assertion that the first event is an
|
||||
// OpenEvent.
|
||||
const OpenEvent& context=dynamic_cast<const OpenEvent&>(events.get());
|
||||
process(events, context);
|
||||
}
|
||||
catch (std::exception& e)
|
||||
{
|
||||
std::cout << "Parsing error: " << e.what() << '\n';
|
||||
}
|
||||
|
||||
This problem does not map at all well to communicating between independent
|
||||
threads. It makes no sense for either side to proceed independently of the
|
||||
other. You want them to pass control back and forth.
|
||||
|
||||
The solution involves a small polymorphic event class hierarchy, to which
|
||||
we're passing references. The actual instances are temporaries on the
|
||||
coroutine's stack; the coroutine passes each reference in turn to the main
|
||||
logic. Copying them as base-class values would slice them.
|
||||
|
||||
If we were trying to let the SAX parser proceed independently of the consuming
|
||||
logic, we could imagine allocating event-subclass instances on the heap,
|
||||
passing them along on a thread-safe queue of pointers. But that doesn't work
|
||||
either, because these event classes bind references passed by the SAX parser.
|
||||
The moment the parser moves on, those references become invalid.
|
||||
|
||||
Instead of binding a ['TagType&] reference, we could store a copy of
|
||||
the ['TagType] in ['CloseEvent]. But that doesn't solve the whole
|
||||
problem. For attributes, we get an ['AttributeIterator&]; for text we get
|
||||
a ['CharIterator&]. Storing a copy of those iterators is pointless: once
|
||||
the parser moves on, those iterators are invalidated. You must process the
|
||||
attribute iterator (or character iterator) during the SAX callback for that
|
||||
event.
|
||||
|
||||
Naturally we could retrieve and store a copy of every attribute and its value;
|
||||
we could store a copy of every chunk of text. That would effectively be all
|
||||
the text in the document -- a heavy price to pay, if the reason we're using
|
||||
SAX is concern about fitting the entire DOM into memory.
|
||||
|
||||
There's yet another advantage to using coroutines. This SAX parser throws an
|
||||
exception when parsing fails. With a coroutine implementation, you need only
|
||||
wrap the calling code in try/catch.
|
||||
|
||||
With communicating threads, you would have to arrange to catch the exception
|
||||
and pass along the exception pointer on the same queue you're using to deliver
|
||||
the other events. You would then have to rethrow the exception to unwind the
|
||||
recursive document processing.
|
||||
|
||||
The coroutine solution maps very naturally to the problem space.
|
||||
|
||||
|
||||
[heading 'same fringe' problem]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user