2010-02-19 Tatsuhiro Tsujikawa <t-tujikawa@users.sourceforge.net>

Added unit tests for util::getContentDispositionFilename() from http://greenbytes.de/tech/tc2231/. Fixed the function so that the added tests pass. * src/util.cc * test/UtilTest.cc
2010-02-19 10:54:40 +00:00 · 2010-02-19 10:54:40 +00:00 · 780aaf9c80
parent cc056289e7
commit 780aaf9c80
3 changed files with 164 additions and 47 deletions
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2010-02-19  Tatsuhiro Tsujikawa  <t-tujikawa@users.sourceforge.net>
+
+	Added unit tests for util::getContentDispositionFilename() from
+	http://greenbytes.de/tech/tc2231/ Fixed the function so that added
+	tests are passed.
+	* src/util.cc
+	* test/UtilTest.cc
+
 2010-02-18  Tatsuhiro Tsujikawa  <t-tujikawa@users.sourceforge.net>

 	Removed setlocale() for LC_CTYPE. It may affect isxdigit in
--- a/src/util.cc
+++ b/src/util.cc
@@ -201,6 +201,21 @@ std::string replace(const std::string& target, const std::string& oldstr, const
  return result;
 }

+bool isAlpha(const char c)
+{
+  return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
+}
+
+bool isDigit(const char c)
+{
+  return '0' <= c && c <= '9';
+}
+
+bool isHexDigit(const char c)
+{
+  return isDigit(c) || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f');
+}
+
 bool inRFC3986ReservedChars(const char c)
 {
  static const char reserved[] = {
@@ -214,15 +229,34 @@ bool inRFC3986ReservedChars(const char c)
 bool inRFC3986UnreservedChars(const char c)
 {
  static const char unreserved[] = { '-', '.', '_', '~' };
-  return
-    // ALPHA
-    ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') ||
-    // DIGIT
-    ('0' <= c && c <= '9') ||
+  return isAlpha(c) || isDigit(c) ||
    std::find(&unreserved[0], &unreserved[arrayLength(unreserved)], c) !=
    &unreserved[arrayLength(unreserved)];
 }

+bool inRFC2978MIMECharset(const char c)
+{
+  static const char chars[] = {
+    '!', '#', '$', '%', '&',
+    '\'', '+', '-', '^', '_',
+    '`', '{', '}', '~'
+  };
+  return isAlpha(c) || isDigit(c) ||
+    std::find(&chars[0], &chars[arrayLength(chars)], c) !=
+    &chars[arrayLength(chars)];
+}
+
+bool inRFC2616HttpToken(const char c)
+{
+  static const char chars[] = {
+    '!', '#', '$', '%', '&', '\'', '*', '+', '-', '.',
+    '^', '_', '`', '|', '~'
+  };
+  return isAlpha(c) || isDigit(c) ||
+    std::find(&chars[0], &chars[arrayLength(chars)], c) !=
+    &chars[arrayLength(chars)];
+}
+
 std::string urlencode(const unsigned char* target, size_t len) {
  std::string dest;
  for(size_t i = 0; i < len; ++i) {
@@ -244,9 +278,7 @@ std::string urlencode(const std::string& target)
 std::string torrentUrlencode(const unsigned char* target, size_t len) {
  std::string dest;
  for(size_t i = 0; i < len; ++i) {
-    if(('0' <= target[i] && target[i] <= '9') ||
-       ('A' <= target[i] && target[i] <= 'Z') ||
-       ('a' <= target[i] && target[i] <= 'z')) {
+    if(isAlpha(target[i]) || isDigit(target[i])) {
      dest += target[i];
    } else {
      dest.append(StringFormat("%%%02X", target[i]).str());
@@ -267,7 +299,7 @@ std::string urldecode(const std::string& target) {
      itr != target.end(); ++itr) {
    if(*itr == '%') {
      if(itr+1 != target.end() && itr+2 != target.end() &&
-         isxdigit(*(itr+1)) && isxdigit(*(itr+2))) {
+         isHexDigit(*(itr+1)) && isHexDigit(*(itr+2))) {
        result += parseInt(std::string(itr+1, itr+3), 16);
        itr += 2;
      } else {
@@ -614,12 +646,16 @@ static std::string trimBasename(const std::string& src)
 {
  static const std::string TRIMMED("\r\n\t '\"");
  std::string fn = File(trim(src, TRIMMED)).getBasename();
+  std::string::iterator enditer = std::remove(fn.begin(), fn.end(), '\\');
+  fn = std::string(fn.begin(), enditer);
  if(fn == ".." || fn == A2STR::DOT_C) {
    fn = A2STR::NIL;
  }
  return fn;
 }

+// Converts ISO/IEC 8859-1 string to UTF-8 string.  If there is a
+// character not in ISO/IEC 8859-1, returns empty string.
 std::string iso8859ToUtf8(const std::string& src)
 {
  std::string dest;
@@ -632,6 +668,8 @@ std::string iso8859ToUtf8(const std::string& src)
        dest += 0xc3;
      }
      dest += c&(~0x40);
+    } else if(0x80 <= c && c <= 0x9f) {
+      return A2STR::NIL;
    } else {
      dest += c;
    }
@@ -648,15 +686,74 @@ std::string getContentDispositionFilename(const std::string& header)
      i != params.end(); ++i) {
    std::string& param = *i;
    static const std::string keyName = "filename";
-    if(!startsWith(param, keyName)) {
+    if(!startsWith(toLower(param), keyName) || param.size() == keyName.size()) {
      continue;
    }
    std::string::iterator markeritr = param.begin()+keyName.size();
-    for(; markeritr != param.end() && *markeritr == ' '; ++markeritr);
-    if(markeritr == param.end()) {
-      continue;
-    }
-    if(*markeritr == '=') {
+    if(*markeritr == '*') {
+      // See RFC2231 Section4 and draft-reschke-rfc2231-in-http.
+      // Please note that this function doesn't do charset conversion
+      // except that if iso-8859-1 is specified, it is converted to
+      // utf-8.
+      ++markeritr;
+      for(; markeritr != param.end() && *markeritr == ' '; ++markeritr);
+      if(markeritr == param.end() || *markeritr != '=') {
+        continue;
+      }
+      std::pair<std::string, std::string> paramPair;
+      split(paramPair, param, '=');
+      std::string value = paramPair.second;
+      std::vector<std::string> extValues;
+      split(value, std::back_inserter(extValues), "'", false, true);
+      if(extValues.size() != 3) {
+        continue;
+      }
+      bool bad = false;
+      const std::string& charset = extValues[0];
+      for(std::string::const_iterator j = charset.begin(); j != charset.end();
+          ++j) {
+        // Since we first split parameter by ', we can safely assume
+        // that ' is not included in charset.
+        if(!inRFC2978MIMECharset(*j)) {
+          bad = true;
+          break;
+        }
+      }
+      if(bad) {
+        continue;
+      }
+      bad = false;
+      value = extValues[2];
+      for(std::string::const_iterator j = value.begin(); j != value.end(); ++j){
+        if(*j == '%') {
+          if(j+1 != value.end() && isHexDigit(*(j+1)) &&
+             j+2 != value.end() && isHexDigit(*(j+2))) {
+            j += 2;
+          } else {
+            bad = true;
+            break;
+          }
+        } else {
+          if(*j == '*' || *j == '\'' || !inRFC2616HttpToken(*j)) {
+            bad = true;
+            break;
+          }
+        }
+      }
+      if(bad) {
+        continue;
+      }
+      value = trimBasename(urldecode(value));
+      if(toLower(extValues[0]) == "iso-8859-1") {
+        value = iso8859ToUtf8(value);
+      }
+      filename = value;
+      break;
+    } else {
+      for(; markeritr != param.end() && *markeritr == ' '; ++markeritr);
+      if(markeritr == param.end() || *markeritr != '=') {
+        continue;
+      }
      std::pair<std::string, std::string> paramPair;
      split(paramPair, param, '=');
      std::string value = paramPair.second;
@@ -672,35 +769,9 @@ std::string getContentDispositionFilename(const std::string& header)
      } else {
        filenameLast = value.end();
      }
-      value = trimBasename(std::string(value.begin(), filenameLast));
-      if(value.empty()) {
-        continue;
-      }
-      filename = urldecode(value);
-      // continue because there is a chance we can find filename*=...
-    } else if(*markeritr == '*') {
-      // See RFC2231 Section4 and draft-reschke-rfc2231-in-http.
-      // Please note that this function doesn't do charset conversion
-      // except that if iso-8859-1 is specified, it is converted to
-      // utf-8.
-      std::pair<std::string, std::string> paramPair;
-      split(paramPair, param, '=');
-      std::string value = paramPair.second;
-      std::vector<std::string> extValues;
-      split(value, std::back_inserter(extValues), "'", false, true);
-      if(extValues.size() != 3) {
-        continue;
-      }
-      value = trimBasename(extValues[2]);
-      if(value.empty()) {
-        continue;
-      }
-      value = urldecode(value);
-      if(extValues[0] == "iso-8859-1") {
-        value = iso8859ToUtf8(value);
-      }
+      value = trimBasename(urldecode(std::string(value.begin(), filenameLast)));
      filename = value;
-      break;
+      // continue because there is a chance we can find filename*=...
    }
  }
  return filename;
--- a/test/UtilTest.cc
+++ b/test/UtilTest.cc
@@ -314,9 +314,6 @@ void UtilTest::testGetContentDispositionFilename() {
  CPPUNIT_ASSERT_EQUAL(std::string(),
                       util::getContentDispositionFilename(currentDir));
  // RFC2231 Section4
-  std::string extparam1 = "attachment; filename * = UTF-8'ja'filename";
-  CPPUNIT_ASSERT_EQUAL(std::string("filename"),
-                       util::getContentDispositionFilename(extparam1));
  std::string extparam2 = "filename*=''aria2";
  CPPUNIT_ASSERT_EQUAL(std::string("aria2"),
                       util::getContentDispositionFilename(extparam2));
@@ -338,12 +335,53 @@ void UtilTest::testGetContentDispositionFilename() {
  std::string extparam8 = "filename=aria2;filename*=UTF-8''hello%20world";
  CPPUNIT_ASSERT_EQUAL(std::string("hello world"),
                       util::getContentDispositionFilename(extparam8));
-  std::string extparam9 = "filename*=iso-8859-1''%A3";
+  std::string extparam9 = "filename*=ISO-8859-1''%A3";
  std::string extparam9ans;
  extparam9ans += 0xc2;
  extparam9ans += 0xa3;
  CPPUNIT_ASSERT_EQUAL(extparam9ans,
                       util::getContentDispositionFilename(extparam9));
+
+  // Tests from http://greenbytes.de/tech/tc2231/
+  // attwithasciifnescapedchar
+  CPPUNIT_ASSERT_EQUAL
+    (std::string("foo.html"),
+     util::getContentDispositionFilename("filename=\"f\\oo.html\""));
+  // attwithasciifilenameucase
+  CPPUNIT_ASSERT_EQUAL
+    (std::string("foo.html"),
+     util::getContentDispositionFilename("FILENAME=\"foo.html\""));
+  // attwithisofn2231iso
+  CPPUNIT_ASSERT_EQUAL
+    (std::string("foo-ä.html"),
+     util::getContentDispositionFilename("filename*=iso-8859-1''foo-%E4.html"));
+  // attwithfn2231utf8
+  CPPUNIT_ASSERT_EQUAL
+    (std::string("foo-ä-€.html"),
+     util::getContentDispositionFilename
+     ("filename*=UTF-8''foo-%c3%a4-%e2%82%ac.html"));
+  // attwithfn2231utf8-bad
+  CPPUNIT_ASSERT_EQUAL
+    (std::string(""),
+     util::getContentDispositionFilename
+     ("filename*=iso-8859-1''foo-%c3%a4-%e2%82%ac.html"));
+  // attwithfn2231ws1
+  CPPUNIT_ASSERT_EQUAL
+    (std::string(""),
+     util::getContentDispositionFilename("filename *=UTF-8''foo-%c3%a4.html"));
+  // attwithfn2231ws2
+  CPPUNIT_ASSERT_EQUAL
+    (std::string("foo-ä.html"),
+     util::getContentDispositionFilename("filename*= UTF-8''foo-%c3%a4.html"));
+  // attwithfn2231ws3
+  CPPUNIT_ASSERT_EQUAL
+    (std::string("foo-ä.html"),
+     util::getContentDispositionFilename("filename* =UTF-8''foo-%c3%a4.html"));
+  // attwithfn2231quot
+  CPPUNIT_ASSERT_EQUAL
+    (std::string(""),
+     util::getContentDispositionFilename
+     ("filename*=\"UTF-8''foo-%c3%a4.html\""));
 }

 class Printer {