diff options
Diffstat (limited to 'src/mime-parsefull.cc')
-rw-r--r-- | src/mime-parsefull.cc | 603 |
1 files changed, 603 insertions, 0 deletions
diff --git a/src/mime-parsefull.cc b/src/mime-parsefull.cc new file mode 100644 index 0000000..53d07db --- /dev/null +++ b/src/mime-parsefull.cc @@ -0,0 +1,603 @@ +/** -------------------------------------------------------------------- + * @file mime-parsefull.cc + * @brief Implementation of main mime parser components + * @author Andreas Aardal Hanssen + * @date 2002-2005 + * ----------------------------------------------------------------- **/ +#include "mime.h" +#include "mime-utils.h" +#include "mime-inputsource.h" +#include "convert.h" +#include <string> +#include <vector> +#include <map> +#include <exception> +#include <iostream> + +#include <string.h> +#include <ctype.h> +#include <stdio.h> +#include <errno.h> + +Binc::MimeInputSource *mimeSource = 0; + +using namespace ::std; + +//------------------------------------------------------------------------ +void Binc::MimeDocument::parseFull(int fd) const +{ + if (allIsParsed) + return; + + allIsParsed = true; + + if (!mimeSource || mimeSource->getFileDescriptor() != fd) { + delete mimeSource; + mimeSource = new MimeInputSource(fd); + } else { + mimeSource->reset(); + } + + headerstartoffsetcrlf = 0; + headerlength = 0; + bodystartoffsetcrlf = 0; + bodylength = 0; + size = 0; + messagerfc822 = false; + multipart = false; + + int bsize = 0; + string bound; + MimePart::parseFull(bound, bsize); + + // eat any trailing junk to get the correct size + char c; + while (mimeSource->getChar(&c)); + + size = mimeSource->getOffset(); +} + +//------------------------------------------------------------------------ +static bool parseOneHeaderLine(Binc::Header *header, unsigned int *nlines) +{ + using namespace ::Binc; + char c; + bool eof = false; + char cqueue[4]; + string name; + string content; + + while (mimeSource->getChar(&c)) { + // If we encounter a \r before we got to the first ':', then + // rewind back to the start of the line and assume we're at the + // start of the body. + if (c == '\r') { + for (int i = 0; i < (int) name.length() + 1; ++i) + mimeSource->ungetChar(); + return false; + } + + // A colon marks the end of the header name + if (c == ':') break; + + // Otherwise add to the header name + name += c; + } + + cqueue[0] = '\0'; + cqueue[1] = '\0'; + cqueue[2] = '\0'; + cqueue[3] = '\0'; + + // Read until the end of the header. + bool endOfHeaders = false; + while (!endOfHeaders) { + if (!mimeSource->getChar(&c)) { + eof = true; + break; + } + + if (c == '\n') ++*nlines; + + for (int i = 0; i < 3; ++i) + cqueue[i] = cqueue[i + 1]; + cqueue[3] = c; + + if (strncmp(cqueue, "\r\n\r\n", 4) == 0) { + endOfHeaders = true; + break; + } + + // If the last character was a newline, and the first now is not + // whitespace, then rewind one character and store the current + // key,value pair. + if (cqueue[2] == '\n' && c != ' ' && c != '\t') { + if (content.length() > 2) + content.resize(content.length() - 2); + + trim(content); + header->add(name, content); + + if (c != '\r') { + mimeSource->ungetChar(); + if (c == '\n') --*nlines; + return true; + } + + mimeSource->getChar(&c); + return false; + } + + content += c; + } + + if (name != "") { + if (content.length() > 2) + content.resize(content.length() - 2); + header->add(name, content); + } + + return !(eof || endOfHeaders); +} + +//------------------------------------------------------------------------ +static void parseHeader(Binc::Header *header, unsigned int *nlines) +{ + while (parseOneHeaderLine(header, nlines)) + { } +} + +//------------------------------------------------------------------------ +static void analyzeHeader(Binc::Header *header, bool *multipart, + bool *messagerfc822, string *subtype, string *boundary) +{ + using namespace ::Binc; + + // Do simple parsing of headers to determine the + // type of message (multipart,messagerfc822 etc) + HeaderItem ctype; + if (header->getFirstHeader("content-type", ctype)) { + vector<string> types; + split(ctype.getValue(), ";", types); + + if (types.size() > 0) { + // first element should describe content type + string tmp = types[0]; + trim(tmp); + vector<string> v; + split(tmp, "/", v); + string key, value; + + key = (v.size() > 0) ? v[0] : "text"; + value = (v.size() > 1) ? v[1] : "plain"; + lowercase(key); + + if (key == "multipart") { + *multipart = true; + lowercase(value); + *subtype = value; + } else if (key == "message") { + lowercase(value); + if (value == "rfc822") + *messagerfc822 = true; + } + } + + for (vector<string>::const_iterator i = types.begin(); + i != types.end(); ++i) { + string element = *i; + trim(element); + + if (element.find("=") != string::npos) { + string::size_type pos = element.find('='); + string key = element.substr(0, pos); + string value = element.substr(pos + 1); + + lowercase(key); + trim(key); + + if (key == "boundary") { + trim(value, " \""); + *boundary = value; + } + } + } + } +} + +static void parseMessageRFC822(vector<Binc::MimePart> *members, + bool *foundendofpart, + unsigned int *bodylength, + unsigned int *nbodylines, + const string &toboundary) +{ + using namespace ::Binc; + + // message rfc822 means a completely enclosed mime document. we + // call the parser recursively, and pass on the boundary string + // that we got. when parse() finds this boundary, it returns 0. if + // it finds the end boundary (boundary + "--"), it returns != 0. + MimePart m; + + unsigned int bodystartoffsetcrlf = mimeSource->getOffset(); + + // parsefull returns the number of bytes that need to be removed + // from the body because of the terminating boundary string. + int bsize = 0; + if (m.parseFull(toboundary, bsize)) + *foundendofpart = true; + + // make sure bodylength doesn't overflow + *bodylength = mimeSource->getOffset(); + if (*bodylength >= bodystartoffsetcrlf) { + *bodylength -= bodystartoffsetcrlf; + if (*bodylength >= (unsigned int) bsize) { + *bodylength -= (unsigned int) bsize; + } else { + *bodylength = 0; + } + } else { + *bodylength = 0; + } + + *nbodylines += m.getNofLines(); + + members->push_back(m); +} + +static bool skipUntilBoundary(const string &delimiter, + unsigned int *nlines, bool *eof) +{ + int endpos = delimiter.length(); + char *delimiterqueue = 0; + int delimiterpos = 0; + const char *delimiterStr = delimiter.c_str(); + if (delimiter != "") { + delimiterqueue = new char[endpos]; + memset(delimiterqueue, 0, endpos); + } + + // first, skip to the first delimiter string. Anything between the + // header and the first delimiter string is simply ignored (it's + // usually a text message intended for non-mime clients) + char c; + + bool foundBoundary = false; + for (;;) { + if (!mimeSource->getChar(&c)) { + *eof = true; + break; + } + + if (c == '\n') ++*nlines; + + // if there is no delimiter, we just read until the end of the + // file. + if (!delimiterqueue) continue; + + delimiterqueue[delimiterpos++ % endpos] = c; + + if (compareStringToQueue(delimiterStr, delimiterqueue, + delimiterpos, endpos)) { + foundBoundary = true; + break; + } + } + + delete[] delimiterqueue; + delimiterqueue = 0; + + return foundBoundary; +} + + +static void parseMultipart(const string &boundary, + const string &toboundary, + bool *eof, + unsigned int *nlines, + int *boundarysize, + bool *foundendofpart, + unsigned int *bodylength, + vector<Binc::MimePart> *members) +{ + using namespace ::Binc; + unsigned int bodystartoffsetcrlf = mimeSource->getOffset(); + + // multipart parsing starts with skipping to the first + // boundary. then we call parse() for all parts. the last parse() + // command will return a code indicating that it found the last + // boundary of this multipart. Note that the first boundary does + // not have to start with CRLF. + string delimiter = "--" + boundary; + + skipUntilBoundary(delimiter, nlines, eof); + + if (!eof) *boundarysize = delimiter.size(); + + // Read two more characters. This may be CRLF, it may be "--" and + // it may be any other two characters. + + char a; + if (!mimeSource->getChar(&a)) *eof = true; + if (a == '\n') ++*nlines; + + char b; + if (!mimeSource->getChar(&b)) *eof = true; + if (b == '\n') ++*nlines; + + // If we find two dashes after the boundary, then this is the end + // of boundary marker. + if (!*eof) { + if (a == '-' && b == '-') { + *foundendofpart = true; + *boundarysize += 2; + + if (!mimeSource->getChar(&a)) *eof = true; + if (a == '\n') ++*nlines; + if (!mimeSource->getChar(&b)) *eof = true; + if (b == '\n') ++*nlines; + } + + if (a == '\r' && b == '\n') { + // This exception is to handle a special case where the + // delimiter of one part is not followed by CRLF, but + // immediately followed by a CRLF prefixed delimiter. + if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b)) + *eof = true; + else if (a == '-' && b == '-') { + mimeSource->ungetChar(); + mimeSource->ungetChar(); + mimeSource->ungetChar(); + mimeSource->ungetChar(); + } else { + mimeSource->ungetChar(); + mimeSource->ungetChar(); + } + + *boundarysize += 2; + } else { + mimeSource->ungetChar(); + mimeSource->ungetChar(); + } + } + + // read all mime parts. + if (!*foundendofpart && !*eof) { + bool quit = false; + do { + MimePart m; + + // If parseFull returns != 0, then it encountered the multipart's + // final boundary. + int bsize = 0; + if (m.parseFull(boundary, bsize)) { + quit = true; + *boundarysize = bsize; + } + + members->push_back(m); + + } while (!quit); + } + + if (!*foundendofpart && !*eof) { + // multipart parsing starts with skipping to the first + // boundary. then we call parse() for all parts. the last parse() + // command will return a code indicating that it found the last + // boundary of this multipart. Note that the first boundary does + // not have to start with CRLF. + string delimiter = "\r\n--" + toboundary; + + skipUntilBoundary(delimiter, nlines, eof); + + if (!*eof) *boundarysize = delimiter.size(); + + // Read two more characters. This may be CRLF, it may be "--" and + // it may be any other two characters. + + char a = '\0'; + if (!mimeSource->getChar(&a)) *eof = true; + if (a == '\n') ++*nlines; + + char b = '\0'; + if (!mimeSource->getChar(&b)) *eof = true; + if (b == '\n') ++*nlines; + + // If we find two dashes after the boundary, then this is the end + // of boundary marker. + if (!*eof) { + if (a == '-' && b == '-') { + *foundendofpart = true; + *boundarysize += 2; + if (!mimeSource->getChar(&a)) + *eof = true; + if (a == '\n') + ++*nlines; + if (!mimeSource->getChar(&b)) + *eof = true; + if (b == '\n') + ++*nlines; + } + + if (a == '\r' && b == '\n') { + // This exception is to handle a special case where the + // delimiter of one part is not followed by CRLF, but + // immediately followed by a CRLF prefixed delimiter. + if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b)) + *eof = true; + else if (a == '-' && b == '-') { + mimeSource->ungetChar(); + mimeSource->ungetChar(); + mimeSource->ungetChar(); + mimeSource->ungetChar(); + } else { + mimeSource->ungetChar(); + mimeSource->ungetChar(); + } + + *boundarysize += 2; + } else { + mimeSource->ungetChar(); + mimeSource->ungetChar(); + } + } + } + + // make sure bodylength doesn't overflow + *bodylength = mimeSource->getOffset(); + if (*bodylength >= bodystartoffsetcrlf) { + *bodylength -= bodystartoffsetcrlf; + if (*bodylength >= (unsigned int) *boundarysize) { + *bodylength -= (unsigned int) *boundarysize; + } else { + *bodylength = 0; + } + } else { + *bodylength = 0; + } +} + +static void parseSinglePart(const string &toboundary, + int *boundarysize, + unsigned int *nbodylines, + unsigned int *nlines, + bool *eof, bool *foundendofpart, + unsigned int *bodylength) +{ + using namespace ::Binc; + unsigned int bodystartoffsetcrlf = mimeSource->getOffset(); + + // If toboundary is empty, then we read until the end of the + // file. Otherwise we will read until we encounter toboundary. + string _toboundary; + if (toboundary != "") { + _toboundary = "\r\n--"; + _toboundary += toboundary; + } + + // if (skipUntilBoundary(_toboundary, nlines, eof)) + // *boundarysize = _toboundary.length(); + + char *boundaryqueue = 0; + int endpos = _toboundary.length(); + if (toboundary != "") { + boundaryqueue = new char[endpos]; + memset(boundaryqueue, 0, endpos); + } + int boundarypos = 0; + + *boundarysize = 0; + + const char *_toboundaryStr = _toboundary.c_str(); + string line; + bool toboundaryIsEmpty = (toboundary == ""); + char c; + while (mimeSource->getChar(&c)) { + if (c == '\n') { ++*nbodylines; ++*nlines; } + if (toboundaryIsEmpty) continue; + + // find boundary + boundaryqueue[boundarypos++ % endpos] = c; + + if (compareStringToQueue(_toboundaryStr, boundaryqueue, + boundarypos, endpos)) { + *boundarysize = _toboundary.length(); + break; + } + } + + delete[] boundaryqueue; + + if (toboundary != "") { + + char a; + if (!mimeSource->getChar(&a)) *eof = true; + if (a == '\n') ++*nlines; + + char b; + if (!mimeSource->getChar(&b)) *eof = true; + if (b == '\n') ++*nlines; + + if (a == '-' && b == '-') { + *boundarysize += 2; + *foundendofpart = true; + if (!mimeSource->getChar(&a)) *eof = true; + if (a == '\n') ++*nlines; + if (!mimeSource->getChar(&b)) *eof = true; + if (b == '\n') ++*nlines; + } + + if (a == '\r' && b == '\n') { + // This exception is to handle a special case where the + // delimiter of one part is not followed by CRLF, but + // immediately followed by a CRLF prefixed delimiter. + if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b)) + *eof = true; + else if (a == '-' && b == '-') { + mimeSource->ungetChar(); + mimeSource->ungetChar(); + mimeSource->ungetChar(); + mimeSource->ungetChar(); + } else { + mimeSource->ungetChar(); + mimeSource->ungetChar(); + } + + *boundarysize += 2; + } else { + mimeSource->ungetChar(); + mimeSource->ungetChar(); + } + } + + // make sure bodylength doesn't overflow + *bodylength = mimeSource->getOffset(); + if (*bodylength >= bodystartoffsetcrlf) { + *bodylength -= bodystartoffsetcrlf; + if (*bodylength >= (unsigned int) *boundarysize) { + *bodylength -= (unsigned int) *boundarysize; + } else { + *bodylength = 0; + } + } else { + *bodylength = 0; + } + +} + +//------------------------------------------------------------------------ +int Binc::MimePart::parseFull(const string &toboundary, + int &boundarysize) const +{ + headerstartoffsetcrlf = mimeSource->getOffset(); + + // Parse the header of this mime part. + parseHeader(&h, &nlines); + + // Headerlength includes the seperating CRLF. Body starts after the + // CRLF. + headerlength = mimeSource->getOffset() - headerstartoffsetcrlf; + bodystartoffsetcrlf = mimeSource->getOffset(); + bodylength = 0; + + // Determine the type of mime part by looking at fields in the + // header. + analyzeHeader(&h, &multipart, &messagerfc822, &subtype, &boundary); + + bool eof = false; + bool foundendofpart = false; + + if (messagerfc822) { + parseMessageRFC822(&members, &foundendofpart, &bodylength, + &nbodylines, toboundary); + + } else if (multipart) { + parseMultipart(boundary, toboundary, &eof, &nlines, &boundarysize, + &foundendofpart, &bodylength, &members); + } else { + parseSinglePart(toboundary, &boundarysize, &nbodylines, &nlines, + &eof, &foundendofpart, &bodylength); + } + + return (eof || foundendofpart) ? 1 : 0; +} |