quickbook/utils.cpp

/*=============================================================================
    Copyright (c) 2002 2004 2006 Joel de Guzman
    Copyright (c) 2004 Eric Niebler
    http://spirit.sourceforge.net/

    Use, modification and distribution is subject to the Boost Software
    License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
=============================================================================*/
#include "utils.hpp"
#include <boost/spirit/include/classic_core.hpp>

#include <algorithm>
#include <cctype>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <stdexcept>
#include <string>

namespace quickbook {
    // Declared here only; the definition is not in the visible part of this
    // file. outerr/outwarn below read it to choose between the
    // "file(line): ..." and "file:line: ..." diagnostic prefixes.
    // NOTE(review): the name suggests a Microsoft-style message format
    // toggled by the caller -- confirm against the definition.
    extern bool ms_errors;
}

namespace quickbook { namespace detail
{
    // Map a character onto the alphabet used for identifiers: alphanumeric
    // characters are kept (lower-cased), anything else is replaced by '_'.
    // The character is widened through unsigned char before it is handed to
    // the <cctype> functions, which require a non-negative argument.
    char filter_identifier_char(char ch)
    {
        unsigned char const raw = static_cast<unsigned char>(ch);
        unsigned char const kept =
            std::isalnum(raw) ? raw : static_cast<unsigned char>('_');
        return static_cast<char>(std::tolower(kept));
    }

    // Un-indent a code segment in place.
    //
    // All leading blank lines (lines holding only spaces/tabs) and leading
    // newlines are removed. Then the smallest indentation found on any
    // non-blank line is stripped from the start of every line. Spaces and
    // tabs each count as one column. If the program holds nothing but
    // whitespace it is cleared, which tells the caller that it is empty.
    void unindent(std::string& program)
    {
        std::string::size_type const npos = std::string::npos;

        // Locate the start of the first line with real content. Every
        // whitespace-only line in front of it is skipped; leaving one behind
        // would make its (arbitrary) width the initial indent below and the
        // real code would not be fully un-indented.
        std::string::size_type first_line = 0;
        for (;;)
        {
            std::string::size_type const text =
                program.find_first_not_of(" \t", first_line);
            if (npos == text)
            {
                // Nothing but spaces and tabs remain: the code is empty.
                program.clear();
                return;
            }
            if (program[text] != '\r' && program[text] != '\n')
                break; // found the first non-blank line

            // Blank line: skip it together with the run of newlines after it.
            first_line = program.find_first_not_of("\r\n", text);
            if (npos == first_line)
            {
                program.clear();
                return;
            }
        }
        program.erase(0, first_line);

        // The first line is now known to be non-blank, so its indent is a
        // valid starting value for the minimum.
        std::string::size_type indent = program.find_first_not_of(" \t");
        std::string::size_type pos = 0;

        // Calculate the minimum indent over every non-blank line
        do
        {
            pos = program.find_first_not_of("\r\n", pos);
            if (npos == pos)
                break;

            std::string::size_type const n =
                program.find_first_not_of(" \t", pos);
            if (n != npos)
            {
                char const ch = program[n];
                if (ch != '\r' && ch != '\n') // ignore empty lines
                    indent = (std::min)(indent, n-pos);
            }
        }
        while (npos != (pos = program.find_first_of("\r\n", pos)));

        // Trim white spaces from column 0..indent
        program.erase(0, indent);
        pos = 0;
        while (npos != (pos = program.find_first_of("\r\n", pos)))
        {
            pos = program.find_first_not_of("\r\n", pos);
            if (npos == pos)
                break;

            // A blank line may be shorter than the indent, so never erase
            // past the end of the line. On the last line `next` is npos; the
            // subtraction then wraps to a huge value and (std::min) simply
            // picks `indent`.
            std::string::size_type const next =
                program.find_first_of("\r\n", pos);
            program.erase(pos, (std::min)(indent, next-pos));
        }
    }

    // Remove the extension from a filename.
    //
    // The extension is everything from the last '.' onwards, provided that
    // dot belongs to the final path component. A dot that is followed by a
    // '/' or '\\' separator is part of a directory name (as in "./readme" or
    // "docs.d/readme"), not an extension, and the name is then returned
    // unchanged. A name without any dot is also returned unchanged.
    std::string
    remove_extension(std::string const& filename)
    {
        std::string::size_type const dot = filename.find_last_of('.');
        if (std::string::npos == dot)
            return filename;

        // A separator after the last dot means the final component has no
        // extension at all.
        if (std::string::npos != filename.find_first_of("/\\", dot))
            return filename;

        return filename.substr(0, dot);
    }

    // Percent-encode a URI.
    //
    // Bytes below 128 that std::isalnum accepts, and the characters listed
    // in `mark`, are copied through unchanged. Every other byte is replaced
    // by '%' followed by its value as two lower-case hexadecimal digits.
    std::string escape_uri(std::string uri)
    {
        static char const mark[] = "-_.!~*'()?\\/";
        static char const hex[] = "0123456789abcdef";

        std::string escaped;
        escaped.reserve(uri.size());

        for (std::string::size_type n = 0; n < uri.size(); ++n)
        {
            // Work on the unsigned byte value. Where plain char is signed,
            // bytes above 127 are negative and must not be used to index
            // the hex table.
            unsigned char const byte = static_cast<unsigned char>(uri[n]);

            bool const is_alnum = byte <= 127 && std::isalnum(byte);
            // strchr also matches the terminating NUL of `mark`, so an
            // embedded NUL byte has to be excluded explicitly.
            bool const is_mark = byte != 0 && 0 != std::strchr(mark, byte);

            if (is_alnum || is_mark)
            {
                escaped += uri[n];
            }
            else
            {
                escaped += '%';
                escaped += hex[byte / 16];
                escaped += hex[byte % 16];
            }
        }
        return escaped;
    }

    // Write the prefix of an error message to std::clog and return the
    // stream so that the caller can append the message text.
    //
    // file: name printed at the start of the prefix.
    // line: line number to print; a negative value omits it.
    //
    // With a line number the prefix is "file(line): error: " when ms_errors
    // is set and "file:line: error: " otherwise.
    std::ostream& outerr(std::string const& file, int line)
    {
        std::ostream& log = std::clog;
        log << file;

        if (line < 0)
            return log << ": error: ";

        if (ms_errors)
            return log << "(" << line << "): error: ";

        return log << ":" << line << ": error: ";
    }

    // Write the prefix of a warning message to std::clog and return the
    // stream so that the caller can append the message text.
    //
    // file: name printed at the start of the prefix.
    // line: line number to print; a negative value omits it.
    //
    // With a line number the prefix is "file(line): warning: " when
    // ms_errors is set and "file:line: warning: " otherwise.
    std::ostream& outwarn(std::string const& file, int line)
    {
        std::ostream& log = std::clog;
        log << file;

        if (line < 0)
            return log << ": warning: ";

        if (ms_errors)
            return log << "(" << line << "): warning: ";

        return log << ":" << line << ": warning: ";
    }

    // Read the first few bytes in a file to see whether it starts with a
    // byte order mark. If it doesn't, then write the characters we've
    // already read to the output. Although, given how UTF-8 works, if even
    // part of the BOM was matched, the file is probably broken.

    // Try to consume the `length` bytes at `chars` from the front of
    // [begin, end).
    //
    // Returns true once all `length` bytes have been matched; `begin` then
    // points just past them and nothing is written to `out`.
    // Returns false on a mismatch or when the input ends early. `begin` is
    // left after the bytes that did match, and those bytes are written to
    // `out` so that they are not lost.
    template <class InputIterator, class OutputIterator>
    bool check_bom(InputIterator& begin, InputIterator end,
            OutputIterator out, char const* chars, int length)
    {
        int matched = 0;

        while(begin != end && *begin == chars[matched]) {
            ++begin;
            ++matched;
            if(matched == length) return true;
        }

        // Failed to match, so hand the consumed characters to the output:
        for(int i = 0; i < matched; ++i) *out++ = chars[i];

        return false;
    }

    // Look for a byte order mark at the front of [begin, end).
    //
    // Returns "UTF-8", "UTF-16" or "UTF-32" when the corresponding mark is
    // found, and an empty string otherwise (including for empty input).
    // The matching itself is done by check_bom, which advances `begin` past
    // the bytes it consumed and writes the bytes of a failed partial match
    // to `out`.
    template <class InputIterator, class OutputIterator>
    std::string read_bom(InputIterator& begin, InputIterator end,
            OutputIterator out)
    {
        if(begin == end) return "";

        // Spelled as character literals: a braced initializer such as
        // {0xef} is a narrowing conversion from int to char, which is
        // ill-formed since C++11 on platforms where char is signed.
        static const char utf8[] = {'\xef', '\xbb', '\xbf'};
        static const char utf32be[] = {'\0', '\0', '\xfe', '\xff'};
        static const char utf32le[] = {'\xff', '\xfe', '\0', '\0'};

        // Compare as unsigned so that the case labels above 0x7f match.
        unsigned char c = *begin;
        switch(c)
        {
        case 0xEF: // UTF-8
            return check_bom(begin, end, out, utf8, 3) ? "UTF-8" : "";
        case 0xFF: // UTF-16/UTF-32 little endian
            // Both marks start with FF FE; UTF-32 continues with 00 00.
            return !check_bom(begin, end, out, utf32le, 2) ? "" :
                check_bom(begin, end, out, utf32le + 2, 2) ? "UTF-32" : "UTF-16";
        case 0: // UTF-32 big endian
            return check_bom(begin, end, out, utf32be, 4) ? "UTF-32" : "";
        case 0xFE: // UTF-16 big endian (the tail of the UTF-32 BE mark)
            return check_bom(begin, end, out, utf32be + 2, 2) ? "UTF-16" : "";
        default:
            return "";
        }
    }

    // Copy a string, converting mac and windows style newlines to unix
    // newlines.

    // Copy [begin, end) to `out`, turning "\r\n" and lone "\r" line endings
    // into "\n".
    //
    // The input is first passed to read_bom. If it reports an encoding other
    // than UTF-8 (or none), an error naming `filename` is written through
    // outerr and false is returned without copying the rest of the input.
    // Returns true otherwise.
    template <class InputIterator, class OutputIterator>
    bool normalize(InputIterator begin, InputIterator end,
            OutputIterator out, std::string const& filename)
    {
        std::string const encoding = read_bom(begin, end, out);
        bool const supported = encoding == "" || encoding == "UTF-8";

        if(!supported) {
            outerr(filename) << encoding << " is not supported. Please use UTF-8."
                << std::endl;

            return false;
        }

        for(; begin != end; ) {
            if(*begin != '\r') {
                // Ordinary character: copy it through untouched.
                *out++ = *begin++;
                continue;
            }

            // Carriage return: emit one newline and swallow a directly
            // following line feed, so "\r\n" yields a single "\n".
            *out++ = '\n';
            ++begin;
            if(begin != end && *begin == '\n') ++begin;
        }

        return true;
    }

    // Read the file `filename`, append its contents to `storage` (passed
    // through normalize) and finish with two extra newlines.
    //
    // Returns 0 on success. Returns 1 if the file cannot be opened (an error
    // is reported through outerr) or if normalize returns false.
    int load(std::string const& filename, std::string& storage)
    {
        std::ifstream in(filename.c_str(), std::ios_base::in);

        if (!in)
        {
            outerr(filename) << "Could not open input file." << std::endl;
            return 1;
        }

        // Read straight from the stream buffer: istreambuf_iterator yields
        // every character, whitespace included, so there is no skipws flag
        // to clear and no formatted extraction per character.
        if(!normalize(
            std::istreambuf_iterator<char>(in),
            std::istreambuf_iterator<char>(),
            std::back_inserter(storage),
            filename))
        {
            return 1;
        }

        //  ensure that we have enough trailing newlines to eliminate
        //  the need to check for end of file in the grammar.
        storage.push_back('\n');
        storage.push_back('\n');
        return 0;
    }

    // Map a file extension (as spelled in the table below, e.g. "cpp" or
    // "py") to its file_type.
    //
    // The comparison is exact, so it is case sensitive. An extension that is
    // not in the table yields a value-initialised file_type, the same value
    // the earlier operator[] lookup returned for unknown keys.
    file_type get_file_type(std::string const& extension)
    {
        typedef std::map<std::string, file_type> type_map;

        static type_map ftypes;
        if (ftypes.empty())
        {
            // init the map of types
            ftypes["cpp"] = cpp_file;
            ftypes["hpp"] = cpp_file;
            ftypes["h"] = cpp_file;
            ftypes["c"] = cpp_file;
            ftypes["cxx"] = cpp_file;
            ftypes["hxx"] = cpp_file;
            ftypes["ipp"] = cpp_file;
            ftypes["py"] = python_file;
        }

        // Look up with find() rather than operator[]: the latter would add
        // an entry to the static table for every unknown extension queried.
        type_map::const_iterator const found = ftypes.find(extension);
        if (found == ftypes.end())
            return file_type();
        return found->second;
    }

}}