diff --git a/ChangeLog b/ChangeLog index b47a0aec..ce76473f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2010-02-19 Tatsuhiro Tsujikawa + + Added unit tests for util::getContentDispositionFilename() from + http://greenbytes.de/tech/tc2231/ Fixed the function so that added + tests are passed. + * src/util.cc + * test/UtilTest.cc + 2010-02-18 Tatsuhiro Tsujikawa Removed setlocale() for LC_CTYPE. It may affect isxdigit in diff --git a/src/util.cc b/src/util.cc index 1136c423..1470a12c 100644 --- a/src/util.cc +++ b/src/util.cc @@ -201,6 +201,21 @@ std::string replace(const std::string& target, const std::string& oldstr, const return result; } +bool isAlpha(const char c) +{ + return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'); +} + +bool isDigit(const char c) +{ + return '0' <= c && c <= '9'; +} + +bool isHexDigit(const char c) +{ + return isDigit(c) || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f'); +} + bool inRFC3986ReservedChars(const char c) { static const char reserved[] = { @@ -214,15 +229,34 @@ bool inRFC3986ReservedChars(const char c) bool inRFC3986UnreservedChars(const char c) { static const char unreserved[] = { '-', '.', '_', '~' }; - return - // ALPHA - ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || - // DIGIT - ('0' <= c && c <= '9') || + return isAlpha(c) || isDigit(c) || std::find(&unreserved[0], &unreserved[arrayLength(unreserved)], c) != &unreserved[arrayLength(unreserved)]; } +bool inRFC2978MIMECharset(const char c) +{ + static const char chars[] = { + '!', '#', '$', '%', '&', + '\'', '+', '-', '^', '_', + '`', '{', '}', '~' + }; + return isAlpha(c) || isDigit(c) || + std::find(&chars[0], &chars[arrayLength(chars)], c) != + &chars[arrayLength(chars)]; +} + +bool inRFC2616HttpToken(const char c) +{ + static const char chars[] = { + '!', '#', '$', '%', '&', '\'', '*', '+', '-', '.', + '^', '_', '`', '|', '~' + }; + return isAlpha(c) || isDigit(c) || + std::find(&chars[0], &chars[arrayLength(chars)], c) != + &chars[arrayLength(chars)]; +} + std::string urlencode(const unsigned char* target, size_t len) { std::string dest; for(size_t i = 0; i < len; ++i) { @@ -244,9 +278,7 @@ std::string urlencode(const std::string& target) std::string torrentUrlencode(const unsigned char* target, size_t len) { std::string dest; for(size_t i = 0; i < len; ++i) { - if(('0' <= target[i] && target[i] <= '9') || - ('A' <= target[i] && target[i] <= 'Z') || - ('a' <= target[i] && target[i] <= 'z')) { + if(isAlpha(target[i]) || isDigit(target[i])) { dest += target[i]; } else { dest.append(StringFormat("%%%02X", target[i]).str()); @@ -267,7 +299,7 @@ std::string urldecode(const std::string& target) { itr != target.end(); ++itr) { if(*itr == '%') { if(itr+1 != target.end() && itr+2 != target.end() && - isxdigit(*(itr+1)) && isxdigit(*(itr+2))) { + isHexDigit(*(itr+1)) && isHexDigit(*(itr+2))) { result += parseInt(std::string(itr+1, itr+3), 16); itr += 2; } else { @@ -614,12 +646,16 @@ static std::string trimBasename(const std::string& src) { static const std::string TRIMMED("\r\n\t '\""); std::string fn = File(trim(src, TRIMMED)).getBasename(); + std::string::iterator enditer = std::remove(fn.begin(), fn.end(), '\\'); + fn = std::string(fn.begin(), enditer); if(fn == ".." || fn == A2STR::DOT_C) { fn = A2STR::NIL; } return fn; } +// Converts ISO/IEC 8859-1 string to UTF-8 string. If there is a +// character not in ISO/IEC 8859-1, returns empty string. std::string iso8859ToUtf8(const std::string& src) { std::string dest; @@ -632,6 +668,8 @@ std::string iso8859ToUtf8(const std::string& src) dest += 0xc3; } dest += c&(~0x40); + } else if(0x80 <= c && c <= 0x9f) { + return A2STR::NIL; } else { dest += c; } @@ -648,15 +686,74 @@ std::string getContentDispositionFilename(const std::string& header) i != params.end(); ++i) { std::string& param = *i; static const std::string keyName = "filename"; - if(!startsWith(param, keyName)) { + if(!startsWith(toLower(param), keyName) || param.size() == keyName.size()) { continue; } std::string::iterator markeritr = param.begin()+keyName.size(); - for(; markeritr != param.end() && *markeritr == ' '; ++markeritr); - if(markeritr == param.end()) { - continue; - } - if(*markeritr == '=') { + if(*markeritr == '*') { + // See RFC2231 Section4 and draft-reschke-rfc2231-in-http. + // Please note that this function doesn't do charset conversion + // except that if iso-8859-1 is specified, it is converted to + // utf-8. + ++markeritr; + for(; markeritr != param.end() && *markeritr == ' '; ++markeritr); + if(markeritr == param.end() || *markeritr != '=') { + continue; + } + std::pair paramPair; + split(paramPair, param, '='); + std::string value = paramPair.second; + std::vector extValues; + split(value, std::back_inserter(extValues), "'", false, true); + if(extValues.size() != 3) { + continue; + } + bool bad = false; + const std::string& charset = extValues[0]; + for(std::string::const_iterator j = charset.begin(); j != charset.end(); + ++j) { + // Since we first split parameter by ', we can safely assume + // that ' is not included in charset. + if(!inRFC2978MIMECharset(*j)) { + bad = true; + break; + } + } + if(bad) { + continue; + } + bad = false; + value = extValues[2]; + for(std::string::const_iterator j = value.begin(); j != value.end(); ++j){ + if(*j == '%') { + if(j+1 != value.end() && isHexDigit(*(j+1)) && + j+2 != value.end() && isHexDigit(*(j+2))) { + j += 2; + } else { + bad = true; + break; + } + } else { + if(*j == '*' || *j == '\'' || !inRFC2616HttpToken(*j)) { + bad = true; + break; + } + } + } + if(bad) { + continue; + } + value = trimBasename(urldecode(value)); + if(toLower(extValues[0]) == "iso-8859-1") { + value = iso8859ToUtf8(value); + } + filename = value; + break; + } else { + for(; markeritr != param.end() && *markeritr == ' '; ++markeritr); + if(markeritr == param.end() || *markeritr != '=') { + continue; + } std::pair paramPair; split(paramPair, param, '='); std::string value = paramPair.second; @@ -672,35 +769,9 @@ std::string getContentDispositionFilename(const std::string& header) } else { filenameLast = value.end(); } - value = trimBasename(std::string(value.begin(), filenameLast)); - if(value.empty()) { - continue; - } - filename = urldecode(value); - // continue because there is a chance we can find filename*=... - } else if(*markeritr == '*') { - // See RFC2231 Section4 and draft-reschke-rfc2231-in-http. - // Please note that this function doesn't do charset conversion - // except that if iso-8859-1 is specified, it is converted to - // utf-8. - std::pair paramPair; - split(paramPair, param, '='); - std::string value = paramPair.second; - std::vector extValues; - split(value, std::back_inserter(extValues), "'", false, true); - if(extValues.size() != 3) { - continue; - } - value = trimBasename(extValues[2]); - if(value.empty()) { - continue; - } - value = urldecode(value); - if(extValues[0] == "iso-8859-1") { - value = iso8859ToUtf8(value); - } + value = trimBasename(urldecode(std::string(value.begin(), filenameLast))); filename = value; - break; + // continue because there is a chance we can find filename*=... } } return filename; diff --git a/test/UtilTest.cc b/test/UtilTest.cc index d8379874..c42734c4 100644 --- a/test/UtilTest.cc +++ b/test/UtilTest.cc @@ -314,9 +314,6 @@ void UtilTest::testGetContentDispositionFilename() { CPPUNIT_ASSERT_EQUAL(std::string(), util::getContentDispositionFilename(currentDir)); // RFC2231 Section4 - std::string extparam1 = "attachment; filename * = UTF-8'ja'filename"; - CPPUNIT_ASSERT_EQUAL(std::string("filename"), - util::getContentDispositionFilename(extparam1)); std::string extparam2 = "filename*=''aria2"; CPPUNIT_ASSERT_EQUAL(std::string("aria2"), util::getContentDispositionFilename(extparam2)); @@ -338,12 +335,53 @@ void UtilTest::testGetContentDispositionFilename() { std::string extparam8 = "filename=aria2;filename*=UTF-8''hello%20world"; CPPUNIT_ASSERT_EQUAL(std::string("hello world"), util::getContentDispositionFilename(extparam8)); - std::string extparam9 = "filename*=iso-8859-1''%A3"; + std::string extparam9 = "filename*=ISO-8859-1''%A3"; std::string extparam9ans; extparam9ans += 0xc2; extparam9ans += 0xa3; CPPUNIT_ASSERT_EQUAL(extparam9ans, util::getContentDispositionFilename(extparam9)); + + // Tests from http://greenbytes.de/tech/tc2231/ + // attwithasciifnescapedchar + CPPUNIT_ASSERT_EQUAL + (std::string("foo.html"), + util::getContentDispositionFilename("filename=\"f\\oo.html\"")); + // attwithasciifilenameucase + CPPUNIT_ASSERT_EQUAL + (std::string("foo.html"), + util::getContentDispositionFilename("FILENAME=\"foo.html\"")); + // attwithisofn2231iso + CPPUNIT_ASSERT_EQUAL + (std::string("foo-ä.html"), + util::getContentDispositionFilename("filename*=iso-8859-1''foo-%E4.html")); + // attwithfn2231utf8 + CPPUNIT_ASSERT_EQUAL + (std::string("foo-ä-€.html"), + util::getContentDispositionFilename + ("filename*=UTF-8''foo-%c3%a4-%e2%82%ac.html")); + // attwithfn2231utf8-bad + CPPUNIT_ASSERT_EQUAL + (std::string(""), + util::getContentDispositionFilename + ("filename*=iso-8859-1''foo-%c3%a4-%e2%82%ac.html")); + // attwithfn2231ws1 + CPPUNIT_ASSERT_EQUAL + (std::string(""), + util::getContentDispositionFilename("filename *=UTF-8''foo-%c3%a4.html")); + // attwithfn2231ws2 + CPPUNIT_ASSERT_EQUAL + (std::string("foo-ä.html"), + util::getContentDispositionFilename("filename*= UTF-8''foo-%c3%a4.html")); + // attwithfn2231ws3 + CPPUNIT_ASSERT_EQUAL + (std::string("foo-ä.html"), + util::getContentDispositionFilename("filename* =UTF-8''foo-%c3%a4.html")); + // attwithfn2231quot + CPPUNIT_ASSERT_EQUAL + (std::string(""), + util::getContentDispositionFilename + ("filename*=\"UTF-8''foo-%c3%a4.html\"")); } class Printer {