summaryrefslogtreecommitdiff
path: root/src/mime-parsefull.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/mime-parsefull.cc')
-rw-r--r--src/mime-parsefull.cc603
1 files changed, 603 insertions, 0 deletions
diff --git a/src/mime-parsefull.cc b/src/mime-parsefull.cc
new file mode 100644
index 0000000..53d07db
--- /dev/null
+++ b/src/mime-parsefull.cc
@@ -0,0 +1,603 @@
+/** --------------------------------------------------------------------
+ * @file mime-parsefull.cc
+ * @brief Implementation of main mime parser components
+ * @author Andreas Aardal Hanssen
+ * @date 2002-2005
+ * ----------------------------------------------------------------- **/
+#include "mime.h"
+#include "mime-utils.h"
+#include "mime-inputsource.h"
+#include "convert.h"
+#include <string>
+#include <vector>
+#include <map>
+#include <exception>
+#include <iostream>
+
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <errno.h>
+
+Binc::MimeInputSource *mimeSource = 0;
+
+using namespace ::std;
+
+//------------------------------------------------------------------------
+void Binc::MimeDocument::parseFull(int fd) const
+{
+ if (allIsParsed)
+ return;
+
+ allIsParsed = true;
+
+ if (!mimeSource || mimeSource->getFileDescriptor() != fd) {
+ delete mimeSource;
+ mimeSource = new MimeInputSource(fd);
+ } else {
+ mimeSource->reset();
+ }
+
+ headerstartoffsetcrlf = 0;
+ headerlength = 0;
+ bodystartoffsetcrlf = 0;
+ bodylength = 0;
+ size = 0;
+ messagerfc822 = false;
+ multipart = false;
+
+ int bsize = 0;
+ string bound;
+ MimePart::parseFull(bound, bsize);
+
+ // eat any trailing junk to get the correct size
+ char c;
+ while (mimeSource->getChar(&c));
+
+ size = mimeSource->getOffset();
+}
+
+//------------------------------------------------------------------------
+static bool parseOneHeaderLine(Binc::Header *header, unsigned int *nlines)
+{
+ using namespace ::Binc;
+ char c;
+ bool eof = false;
+ char cqueue[4];
+ string name;
+ string content;
+
+ while (mimeSource->getChar(&c)) {
+ // If we encounter a \r before we got to the first ':', then
+ // rewind back to the start of the line and assume we're at the
+ // start of the body.
+ if (c == '\r') {
+ for (int i = 0; i < (int) name.length() + 1; ++i)
+ mimeSource->ungetChar();
+ return false;
+ }
+
+ // A colon marks the end of the header name
+ if (c == ':') break;
+
+ // Otherwise add to the header name
+ name += c;
+ }
+
+ cqueue[0] = '\0';
+ cqueue[1] = '\0';
+ cqueue[2] = '\0';
+ cqueue[3] = '\0';
+
+ // Read until the end of the header.
+ bool endOfHeaders = false;
+ while (!endOfHeaders) {
+ if (!mimeSource->getChar(&c)) {
+ eof = true;
+ break;
+ }
+
+ if (c == '\n') ++*nlines;
+
+ for (int i = 0; i < 3; ++i)
+ cqueue[i] = cqueue[i + 1];
+ cqueue[3] = c;
+
+ if (strncmp(cqueue, "\r\n\r\n", 4) == 0) {
+ endOfHeaders = true;
+ break;
+ }
+
+ // If the last character was a newline, and the first now is not
+ // whitespace, then rewind one character and store the current
+ // key,value pair.
+ if (cqueue[2] == '\n' && c != ' ' && c != '\t') {
+ if (content.length() > 2)
+ content.resize(content.length() - 2);
+
+ trim(content);
+ header->add(name, content);
+
+ if (c != '\r') {
+ mimeSource->ungetChar();
+ if (c == '\n') --*nlines;
+ return true;
+ }
+
+ mimeSource->getChar(&c);
+ return false;
+ }
+
+ content += c;
+ }
+
+ if (name != "") {
+ if (content.length() > 2)
+ content.resize(content.length() - 2);
+ header->add(name, content);
+ }
+
+ return !(eof || endOfHeaders);
+}
+
+//------------------------------------------------------------------------
+static void parseHeader(Binc::Header *header, unsigned int *nlines)
+{
+ while (parseOneHeaderLine(header, nlines))
+ { }
+}
+
+//------------------------------------------------------------------------
+static void analyzeHeader(Binc::Header *header, bool *multipart,
+ bool *messagerfc822, string *subtype, string *boundary)
+{
+ using namespace ::Binc;
+
+ // Do simple parsing of headers to determine the
+ // type of message (multipart,messagerfc822 etc)
+ HeaderItem ctype;
+ if (header->getFirstHeader("content-type", ctype)) {
+ vector<string> types;
+ split(ctype.getValue(), ";", types);
+
+ if (types.size() > 0) {
+ // first element should describe content type
+ string tmp = types[0];
+ trim(tmp);
+ vector<string> v;
+ split(tmp, "/", v);
+ string key, value;
+
+ key = (v.size() > 0) ? v[0] : "text";
+ value = (v.size() > 1) ? v[1] : "plain";
+ lowercase(key);
+
+ if (key == "multipart") {
+ *multipart = true;
+ lowercase(value);
+ *subtype = value;
+ } else if (key == "message") {
+ lowercase(value);
+ if (value == "rfc822")
+ *messagerfc822 = true;
+ }
+ }
+
+ for (vector<string>::const_iterator i = types.begin();
+ i != types.end(); ++i) {
+ string element = *i;
+ trim(element);
+
+ if (element.find("=") != string::npos) {
+ string::size_type pos = element.find('=');
+ string key = element.substr(0, pos);
+ string value = element.substr(pos + 1);
+
+ lowercase(key);
+ trim(key);
+
+ if (key == "boundary") {
+ trim(value, " \"");
+ *boundary = value;
+ }
+ }
+ }
+ }
+}
+
+static void parseMessageRFC822(vector<Binc::MimePart> *members,
+ bool *foundendofpart,
+ unsigned int *bodylength,
+ unsigned int *nbodylines,
+ const string &toboundary)
+{
+ using namespace ::Binc;
+
+ // message rfc822 means a completely enclosed mime document. we
+ // call the parser recursively, and pass on the boundary string
+ // that we got. when parse() finds this boundary, it returns 0. if
+ // it finds the end boundary (boundary + "--"), it returns != 0.
+ MimePart m;
+
+ unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
+
+ // parsefull returns the number of bytes that need to be removed
+ // from the body because of the terminating boundary string.
+ int bsize = 0;
+ if (m.parseFull(toboundary, bsize))
+ *foundendofpart = true;
+
+ // make sure bodylength doesn't overflow
+ *bodylength = mimeSource->getOffset();
+ if (*bodylength >= bodystartoffsetcrlf) {
+ *bodylength -= bodystartoffsetcrlf;
+ if (*bodylength >= (unsigned int) bsize) {
+ *bodylength -= (unsigned int) bsize;
+ } else {
+ *bodylength = 0;
+ }
+ } else {
+ *bodylength = 0;
+ }
+
+ *nbodylines += m.getNofLines();
+
+ members->push_back(m);
+}
+
+static bool skipUntilBoundary(const string &delimiter,
+ unsigned int *nlines, bool *eof)
+{
+ int endpos = delimiter.length();
+ char *delimiterqueue = 0;
+ int delimiterpos = 0;
+ const char *delimiterStr = delimiter.c_str();
+ if (delimiter != "") {
+ delimiterqueue = new char[endpos];
+ memset(delimiterqueue, 0, endpos);
+ }
+
+ // first, skip to the first delimiter string. Anything between the
+ // header and the first delimiter string is simply ignored (it's
+ // usually a text message intended for non-mime clients)
+ char c;
+
+ bool foundBoundary = false;
+ for (;;) {
+ if (!mimeSource->getChar(&c)) {
+ *eof = true;
+ break;
+ }
+
+ if (c == '\n') ++*nlines;
+
+ // if there is no delimiter, we just read until the end of the
+ // file.
+ if (!delimiterqueue) continue;
+
+ delimiterqueue[delimiterpos++ % endpos] = c;
+
+ if (compareStringToQueue(delimiterStr, delimiterqueue,
+ delimiterpos, endpos)) {
+ foundBoundary = true;
+ break;
+ }
+ }
+
+ delete[] delimiterqueue;
+ delimiterqueue = 0;
+
+ return foundBoundary;
+}
+
+
+static void parseMultipart(const string &boundary,
+ const string &toboundary,
+ bool *eof,
+ unsigned int *nlines,
+ int *boundarysize,
+ bool *foundendofpart,
+ unsigned int *bodylength,
+ vector<Binc::MimePart> *members)
+{
+ using namespace ::Binc;
+ unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
+
+ // multipart parsing starts with skipping to the first
+ // boundary. then we call parse() for all parts. the last parse()
+ // command will return a code indicating that it found the last
+ // boundary of this multipart. Note that the first boundary does
+ // not have to start with CRLF.
+ string delimiter = "--" + boundary;
+
+ skipUntilBoundary(delimiter, nlines, eof);
+
+ if (!eof) *boundarysize = delimiter.size();
+
+ // Read two more characters. This may be CRLF, it may be "--" and
+ // it may be any other two characters.
+
+ char a;
+ if (!mimeSource->getChar(&a)) *eof = true;
+ if (a == '\n') ++*nlines;
+
+ char b;
+ if (!mimeSource->getChar(&b)) *eof = true;
+ if (b == '\n') ++*nlines;
+
+ // If we find two dashes after the boundary, then this is the end
+ // of boundary marker.
+ if (!*eof) {
+ if (a == '-' && b == '-') {
+ *foundendofpart = true;
+ *boundarysize += 2;
+
+ if (!mimeSource->getChar(&a)) *eof = true;
+ if (a == '\n') ++*nlines;
+ if (!mimeSource->getChar(&b)) *eof = true;
+ if (b == '\n') ++*nlines;
+ }
+
+ if (a == '\r' && b == '\n') {
+ // This exception is to handle a special case where the
+ // delimiter of one part is not followed by CRLF, but
+ // immediately followed by a CRLF prefixed delimiter.
+ if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b))
+ *eof = true;
+ else if (a == '-' && b == '-') {
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ } else {
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ }
+
+ *boundarysize += 2;
+ } else {
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ }
+ }
+
+ // read all mime parts.
+ if (!*foundendofpart && !*eof) {
+ bool quit = false;
+ do {
+ MimePart m;
+
+ // If parseFull returns != 0, then it encountered the multipart's
+ // final boundary.
+ int bsize = 0;
+ if (m.parseFull(boundary, bsize)) {
+ quit = true;
+ *boundarysize = bsize;
+ }
+
+ members->push_back(m);
+
+ } while (!quit);
+ }
+
+ if (!*foundendofpart && !*eof) {
+ // multipart parsing starts with skipping to the first
+ // boundary. then we call parse() for all parts. the last parse()
+ // command will return a code indicating that it found the last
+ // boundary of this multipart. Note that the first boundary does
+ // not have to start with CRLF.
+ string delimiter = "\r\n--" + toboundary;
+
+ skipUntilBoundary(delimiter, nlines, eof);
+
+ if (!*eof) *boundarysize = delimiter.size();
+
+ // Read two more characters. This may be CRLF, it may be "--" and
+ // it may be any other two characters.
+
+ char a = '\0';
+ if (!mimeSource->getChar(&a)) *eof = true;
+ if (a == '\n') ++*nlines;
+
+ char b = '\0';
+ if (!mimeSource->getChar(&b)) *eof = true;
+ if (b == '\n') ++*nlines;
+
+ // If we find two dashes after the boundary, then this is the end
+ // of boundary marker.
+ if (!*eof) {
+ if (a == '-' && b == '-') {
+ *foundendofpart = true;
+ *boundarysize += 2;
+ if (!mimeSource->getChar(&a))
+ *eof = true;
+ if (a == '\n')
+ ++*nlines;
+ if (!mimeSource->getChar(&b))
+ *eof = true;
+ if (b == '\n')
+ ++*nlines;
+ }
+
+ if (a == '\r' && b == '\n') {
+ // This exception is to handle a special case where the
+ // delimiter of one part is not followed by CRLF, but
+ // immediately followed by a CRLF prefixed delimiter.
+ if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b))
+ *eof = true;
+ else if (a == '-' && b == '-') {
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ } else {
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ }
+
+ *boundarysize += 2;
+ } else {
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ }
+ }
+ }
+
+ // make sure bodylength doesn't overflow
+ *bodylength = mimeSource->getOffset();
+ if (*bodylength >= bodystartoffsetcrlf) {
+ *bodylength -= bodystartoffsetcrlf;
+ if (*bodylength >= (unsigned int) *boundarysize) {
+ *bodylength -= (unsigned int) *boundarysize;
+ } else {
+ *bodylength = 0;
+ }
+ } else {
+ *bodylength = 0;
+ }
+}
+
+static void parseSinglePart(const string &toboundary,
+ int *boundarysize,
+ unsigned int *nbodylines,
+ unsigned int *nlines,
+ bool *eof, bool *foundendofpart,
+ unsigned int *bodylength)
+{
+ using namespace ::Binc;
+ unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
+
+ // If toboundary is empty, then we read until the end of the
+ // file. Otherwise we will read until we encounter toboundary.
+ string _toboundary;
+ if (toboundary != "") {
+ _toboundary = "\r\n--";
+ _toboundary += toboundary;
+ }
+
+ // if (skipUntilBoundary(_toboundary, nlines, eof))
+ // *boundarysize = _toboundary.length();
+
+ char *boundaryqueue = 0;
+ int endpos = _toboundary.length();
+ if (toboundary != "") {
+ boundaryqueue = new char[endpos];
+ memset(boundaryqueue, 0, endpos);
+ }
+ int boundarypos = 0;
+
+ *boundarysize = 0;
+
+ const char *_toboundaryStr = _toboundary.c_str();
+ string line;
+ bool toboundaryIsEmpty = (toboundary == "");
+ char c;
+ while (mimeSource->getChar(&c)) {
+ if (c == '\n') { ++*nbodylines; ++*nlines; }
+ if (toboundaryIsEmpty) continue;
+
+ // find boundary
+ boundaryqueue[boundarypos++ % endpos] = c;
+
+ if (compareStringToQueue(_toboundaryStr, boundaryqueue,
+ boundarypos, endpos)) {
+ *boundarysize = _toboundary.length();
+ break;
+ }
+ }
+
+ delete[] boundaryqueue;
+
+ if (toboundary != "") {
+
+ char a;
+ if (!mimeSource->getChar(&a)) *eof = true;
+ if (a == '\n') ++*nlines;
+
+ char b;
+ if (!mimeSource->getChar(&b)) *eof = true;
+ if (b == '\n') ++*nlines;
+
+ if (a == '-' && b == '-') {
+ *boundarysize += 2;
+ *foundendofpart = true;
+ if (!mimeSource->getChar(&a)) *eof = true;
+ if (a == '\n') ++*nlines;
+ if (!mimeSource->getChar(&b)) *eof = true;
+ if (b == '\n') ++*nlines;
+ }
+
+ if (a == '\r' && b == '\n') {
+ // This exception is to handle a special case where the
+ // delimiter of one part is not followed by CRLF, but
+ // immediately followed by a CRLF prefixed delimiter.
+ if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b))
+ *eof = true;
+ else if (a == '-' && b == '-') {
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ } else {
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ }
+
+ *boundarysize += 2;
+ } else {
+ mimeSource->ungetChar();
+ mimeSource->ungetChar();
+ }
+ }
+
+ // make sure bodylength doesn't overflow
+ *bodylength = mimeSource->getOffset();
+ if (*bodylength >= bodystartoffsetcrlf) {
+ *bodylength -= bodystartoffsetcrlf;
+ if (*bodylength >= (unsigned int) *boundarysize) {
+ *bodylength -= (unsigned int) *boundarysize;
+ } else {
+ *bodylength = 0;
+ }
+ } else {
+ *bodylength = 0;
+ }
+
+}
+
+//------------------------------------------------------------------------
+int Binc::MimePart::parseFull(const string &toboundary,
+ int &boundarysize) const
+{
+ headerstartoffsetcrlf = mimeSource->getOffset();
+
+ // Parse the header of this mime part.
+ parseHeader(&h, &nlines);
+
+ // Headerlength includes the seperating CRLF. Body starts after the
+ // CRLF.
+ headerlength = mimeSource->getOffset() - headerstartoffsetcrlf;
+ bodystartoffsetcrlf = mimeSource->getOffset();
+ bodylength = 0;
+
+ // Determine the type of mime part by looking at fields in the
+ // header.
+ analyzeHeader(&h, &multipart, &messagerfc822, &subtype, &boundary);
+
+ bool eof = false;
+ bool foundendofpart = false;
+
+ if (messagerfc822) {
+ parseMessageRFC822(&members, &foundendofpart, &bodylength,
+ &nbodylines, toboundary);
+
+ } else if (multipart) {
+ parseMultipart(boundary, toboundary, &eof, &nlines, &boundarysize,
+ &foundendofpart, &bodylength, &members);
+ } else {
+ parseSinglePart(toboundary, &boundarysize, &nbodylines, &nlines,
+ &eof, &foundendofpart, &bodylength);
+ }
+
+ return (eof || foundendofpart) ? 1 : 0;
+}