/** -------------------------------------------------------------------- * @file mime-parsefull.cc * @brief Implementation of main mime parser components * @author Andreas Aardal Hanssen * @date 2002-2005 * ----------------------------------------------------------------- **/ #include "mime.h" #include "mime-utils.h" #include "mime-inputsource.h" #include "convert.h" #include #include #include #include #include #include #include #include #include Binc::MimeInputSource *mimeSource = 0; using namespace ::std; //------------------------------------------------------------------------ void Binc::MimeDocument::parseFull(int fd) const { if (allIsParsed) return; allIsParsed = true; if (!mimeSource || mimeSource->getFileDescriptor() != fd) { delete mimeSource; mimeSource = new MimeInputSource(fd); } else { mimeSource->reset(); } headerstartoffsetcrlf = 0; headerlength = 0; bodystartoffsetcrlf = 0; bodylength = 0; size = 0; messagerfc822 = false; multipart = false; int bsize = 0; string bound; MimePart::parseFull(bound, bsize); // eat any trailing junk to get the correct size char c; while (mimeSource->getChar(&c)); size = mimeSource->getOffset(); } //------------------------------------------------------------------------ static bool parseOneHeaderLine(Binc::Header *header, unsigned int *nlines) { using namespace ::Binc; char c; bool eof = false; char cqueue[4]; string name; string content; while (mimeSource->getChar(&c)) { // If we encounter a \r before we got to the first ':', then // rewind back to the start of the line and assume we're at the // start of the body. if (c == '\r') { for (int i = 0; i < (int) name.length() + 1; ++i) mimeSource->ungetChar(); return false; } // A colon marks the end of the header name if (c == ':') break; // Otherwise add to the header name name += c; } cqueue[0] = '\0'; cqueue[1] = '\0'; cqueue[2] = '\0'; cqueue[3] = '\0'; // Read until the end of the header. bool endOfHeaders = false; while (!endOfHeaders) { if (!mimeSource->getChar(&c)) { eof = true; break; } if (c == '\n') ++*nlines; for (int i = 0; i < 3; ++i) cqueue[i] = cqueue[i + 1]; cqueue[3] = c; if (strncmp(cqueue, "\r\n\r\n", 4) == 0) { endOfHeaders = true; break; } // If the last character was a newline, and the first now is not // whitespace, then rewind one character and store the current // key,value pair. if (cqueue[2] == '\n' && c != ' ' && c != '\t') { if (content.length() > 2) content.resize(content.length() - 2); trim(content); header->add(name, content); if (c != '\r') { mimeSource->ungetChar(); if (c == '\n') --*nlines; return true; } mimeSource->getChar(&c); return false; } content += c; } if (name != "") { if (content.length() > 2) content.resize(content.length() - 2); header->add(name, content); } return !(eof || endOfHeaders); } //------------------------------------------------------------------------ static void parseHeader(Binc::Header *header, unsigned int *nlines) { while (parseOneHeaderLine(header, nlines)) { } } //------------------------------------------------------------------------ static void analyzeHeader(Binc::Header *header, bool *multipart, bool *messagerfc822, string *subtype, string *boundary) { using namespace ::Binc; // Do simple parsing of headers to determine the // type of message (multipart,messagerfc822 etc) HeaderItem ctype; if (header->getFirstHeader("content-type", ctype)) { vector types; split(ctype.getValue(), ";", types); if (types.size() > 0) { // first element should describe content type string tmp = types[0]; trim(tmp); vector v; split(tmp, "/", v); string key, value; key = (v.size() > 0) ? v[0] : "text"; value = (v.size() > 1) ? v[1] : "plain"; lowercase(key); if (key == "multipart") { *multipart = true; lowercase(value); *subtype = value; } else if (key == "message") { lowercase(value); if (value == "rfc822") *messagerfc822 = true; } } for (vector::const_iterator i = types.begin(); i != types.end(); ++i) { string element = *i; trim(element); if (element.find("=") != string::npos) { string::size_type pos = element.find('='); string key = element.substr(0, pos); string value = element.substr(pos + 1); lowercase(key); trim(key); if (key == "boundary") { trim(value, " \""); *boundary = value; } } } } } static void parseMessageRFC822(vector *members, bool *foundendofpart, unsigned int *bodylength, unsigned int *nbodylines, const string &toboundary) { using namespace ::Binc; // message rfc822 means a completely enclosed mime document. we // call the parser recursively, and pass on the boundary string // that we got. when parse() finds this boundary, it returns 0. if // it finds the end boundary (boundary + "--"), it returns != 0. MimePart m; unsigned int bodystartoffsetcrlf = mimeSource->getOffset(); // parsefull returns the number of bytes that need to be removed // from the body because of the terminating boundary string. int bsize = 0; if (m.parseFull(toboundary, bsize)) *foundendofpart = true; // make sure bodylength doesn't overflow *bodylength = mimeSource->getOffset(); if (*bodylength >= bodystartoffsetcrlf) { *bodylength -= bodystartoffsetcrlf; if (*bodylength >= (unsigned int) bsize) { *bodylength -= (unsigned int) bsize; } else { *bodylength = 0; } } else { *bodylength = 0; } *nbodylines += m.getNofLines(); members->push_back(m); } static bool skipUntilBoundary(const string &delimiter, unsigned int *nlines, bool *eof) { int endpos = delimiter.length(); char *delimiterqueue = 0; int delimiterpos = 0; const char *delimiterStr = delimiter.c_str(); if (delimiter != "") { delimiterqueue = new char[endpos]; memset(delimiterqueue, 0, endpos); } // first, skip to the first delimiter string. Anything between the // header and the first delimiter string is simply ignored (it's // usually a text message intended for non-mime clients) char c; bool foundBoundary = false; for (;;) { if (!mimeSource->getChar(&c)) { *eof = true; break; } if (c == '\n') ++*nlines; // if there is no delimiter, we just read until the end of the // file. if (!delimiterqueue) continue; delimiterqueue[delimiterpos++ % endpos] = c; if (compareStringToQueue(delimiterStr, delimiterqueue, delimiterpos, endpos)) { foundBoundary = true; break; } } delete[] delimiterqueue; delimiterqueue = 0; return foundBoundary; } static void parseMultipart(const string &boundary, const string &toboundary, bool *eof, unsigned int *nlines, int *boundarysize, bool *foundendofpart, unsigned int *bodylength, vector *members) { using namespace ::Binc; unsigned int bodystartoffsetcrlf = mimeSource->getOffset(); // multipart parsing starts with skipping to the first // boundary. then we call parse() for all parts. the last parse() // command will return a code indicating that it found the last // boundary of this multipart. Note that the first boundary does // not have to start with CRLF. string delimiter = "--" + boundary; skipUntilBoundary(delimiter, nlines, eof); if (!eof) *boundarysize = delimiter.size(); // Read two more characters. This may be CRLF, it may be "--" and // it may be any other two characters. char a; if (!mimeSource->getChar(&a)) *eof = true; if (a == '\n') ++*nlines; char b; if (!mimeSource->getChar(&b)) *eof = true; if (b == '\n') ++*nlines; // If we find two dashes after the boundary, then this is the end // of boundary marker. if (!*eof) { if (a == '-' && b == '-') { *foundendofpart = true; *boundarysize += 2; if (!mimeSource->getChar(&a)) *eof = true; if (a == '\n') ++*nlines; if (!mimeSource->getChar(&b)) *eof = true; if (b == '\n') ++*nlines; } if (a == '\r' && b == '\n') { // This exception is to handle a special case where the // delimiter of one part is not followed by CRLF, but // immediately followed by a CRLF prefixed delimiter. if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b)) *eof = true; else if (a == '-' && b == '-') { mimeSource->ungetChar(); mimeSource->ungetChar(); mimeSource->ungetChar(); mimeSource->ungetChar(); } else { mimeSource->ungetChar(); mimeSource->ungetChar(); } *boundarysize += 2; } else { mimeSource->ungetChar(); mimeSource->ungetChar(); } } // read all mime parts. if (!*foundendofpart && !*eof) { bool quit = false; do { MimePart m; // If parseFull returns != 0, then it encountered the multipart's // final boundary. int bsize = 0; if (m.parseFull(boundary, bsize)) { quit = true; *boundarysize = bsize; } members->push_back(m); } while (!quit); } if (!*foundendofpart && !*eof) { // multipart parsing starts with skipping to the first // boundary. then we call parse() for all parts. the last parse() // command will return a code indicating that it found the last // boundary of this multipart. Note that the first boundary does // not have to start with CRLF. string delimiter = "\r\n--" + toboundary; skipUntilBoundary(delimiter, nlines, eof); if (!*eof) *boundarysize = delimiter.size(); // Read two more characters. This may be CRLF, it may be "--" and // it may be any other two characters. char a = '\0'; if (!mimeSource->getChar(&a)) *eof = true; if (a == '\n') ++*nlines; char b = '\0'; if (!mimeSource->getChar(&b)) *eof = true; if (b == '\n') ++*nlines; // If we find two dashes after the boundary, then this is the end // of boundary marker. if (!*eof) { if (a == '-' && b == '-') { *foundendofpart = true; *boundarysize += 2; if (!mimeSource->getChar(&a)) *eof = true; if (a == '\n') ++*nlines; if (!mimeSource->getChar(&b)) *eof = true; if (b == '\n') ++*nlines; } if (a == '\r' && b == '\n') { // This exception is to handle a special case where the // delimiter of one part is not followed by CRLF, but // immediately followed by a CRLF prefixed delimiter. if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b)) *eof = true; else if (a == '-' && b == '-') { mimeSource->ungetChar(); mimeSource->ungetChar(); mimeSource->ungetChar(); mimeSource->ungetChar(); } else { mimeSource->ungetChar(); mimeSource->ungetChar(); } *boundarysize += 2; } else { mimeSource->ungetChar(); mimeSource->ungetChar(); } } } // make sure bodylength doesn't overflow *bodylength = mimeSource->getOffset(); if (*bodylength >= bodystartoffsetcrlf) { *bodylength -= bodystartoffsetcrlf; if (*bodylength >= (unsigned int) *boundarysize) { *bodylength -= (unsigned int) *boundarysize; } else { *bodylength = 0; } } else { *bodylength = 0; } } static void parseSinglePart(const string &toboundary, int *boundarysize, unsigned int *nbodylines, unsigned int *nlines, bool *eof, bool *foundendofpart, unsigned int *bodylength) { using namespace ::Binc; unsigned int bodystartoffsetcrlf = mimeSource->getOffset(); // If toboundary is empty, then we read until the end of the // file. Otherwise we will read until we encounter toboundary. string _toboundary; if (toboundary != "") { _toboundary = "\r\n--"; _toboundary += toboundary; } // if (skipUntilBoundary(_toboundary, nlines, eof)) // *boundarysize = _toboundary.length(); char *boundaryqueue = 0; int endpos = _toboundary.length(); if (toboundary != "") { boundaryqueue = new char[endpos]; memset(boundaryqueue, 0, endpos); } int boundarypos = 0; *boundarysize = 0; const char *_toboundaryStr = _toboundary.c_str(); string line; bool toboundaryIsEmpty = (toboundary == ""); char c; while (mimeSource->getChar(&c)) { if (c == '\n') { ++*nbodylines; ++*nlines; } if (toboundaryIsEmpty) continue; // find boundary boundaryqueue[boundarypos++ % endpos] = c; if (compareStringToQueue(_toboundaryStr, boundaryqueue, boundarypos, endpos)) { *boundarysize = _toboundary.length(); break; } } delete[] boundaryqueue; if (toboundary != "") { char a; if (!mimeSource->getChar(&a)) *eof = true; if (a == '\n') ++*nlines; char b; if (!mimeSource->getChar(&b)) *eof = true; if (b == '\n') ++*nlines; if (a == '-' && b == '-') { *boundarysize += 2; *foundendofpart = true; if (!mimeSource->getChar(&a)) *eof = true; if (a == '\n') ++*nlines; if (!mimeSource->getChar(&b)) *eof = true; if (b == '\n') ++*nlines; } if (a == '\r' && b == '\n') { // This exception is to handle a special case where the // delimiter of one part is not followed by CRLF, but // immediately followed by a CRLF prefixed delimiter. if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b)) *eof = true; else if (a == '-' && b == '-') { mimeSource->ungetChar(); mimeSource->ungetChar(); mimeSource->ungetChar(); mimeSource->ungetChar(); } else { mimeSource->ungetChar(); mimeSource->ungetChar(); } *boundarysize += 2; } else { mimeSource->ungetChar(); mimeSource->ungetChar(); } } // make sure bodylength doesn't overflow *bodylength = mimeSource->getOffset(); if (*bodylength >= bodystartoffsetcrlf) { *bodylength -= bodystartoffsetcrlf; if (*bodylength >= (unsigned int) *boundarysize) { *bodylength -= (unsigned int) *boundarysize; } else { *bodylength = 0; } } else { *bodylength = 0; } } //------------------------------------------------------------------------ int Binc::MimePart::parseFull(const string &toboundary, int &boundarysize) const { headerstartoffsetcrlf = mimeSource->getOffset(); // Parse the header of this mime part. parseHeader(&h, &nlines); // Headerlength includes the seperating CRLF. Body starts after the // CRLF. headerlength = mimeSource->getOffset() - headerstartoffsetcrlf; bodystartoffsetcrlf = mimeSource->getOffset(); bodylength = 0; // Determine the type of mime part by looking at fields in the // header. analyzeHeader(&h, &multipart, &messagerfc822, &subtype, &boundary); bool eof = false; bool foundendofpart = false; if (messagerfc822) { parseMessageRFC822(&members, &foundendofpart, &bodylength, &nbodylines, toboundary); } else if (multipart) { parseMultipart(boundary, toboundary, &eof, &nlines, &boundarysize, &foundendofpart, &bodylength, &members); } else { parseSinglePart(toboundary, &boundarysize, &nbodylines, &nlines, &eof, &foundendofpart, &bodylength); } return (eof || foundendofpart) ? 1 : 0; }