play/play/utils/UriEncoding.scala

/*
 * Copyright (C) 2009-2015 Typesafe Inc. <http://www.typesafe.com>
 */
package play.utils

import java.util.BitSet
import java.io.ByteArrayOutputStream

/**
 * Provides support for correctly encoding pieces of URIs.
 *
 * @see http://www.ietf.org/rfc/rfc3986.txt
 */
object UriEncoding {

  /**
   * Encode a string so that it can be used safely in the "path segment"
   * part of a URI. A path segment is defined in RFC 3986. In a URI such
   * as `http://www.example.com/abc/def?a=1&b=2` both `abc` and `def`
   * are path segments.
   *
   * Path segment encoding differs from encoding for other parts of a URI.
   * For example, the "&" character is permitted in a path segment, but
   * has special meaning in query parameters. On the other hand, the "/"
   * character cannot appear in a path segment, as it is the path delimiter,
   * so it must be encoded as "%2F". These are just two examples of the
   * differences between path segment and query string encoding; there are
   * other differences too.
   *
   * When encoding path segments the `encodePathSegment` method should always
   * be used in preference to the [[java.net.URLEncoder.encode(String,String)]]
   * method. `URLEncoder.encode`, despite its name, actually provides encoding
   * in the `application/x-www-form-urlencoded` MIME format which is the encoding
   * used for form data in HTTP GET and POST requests. This encoding is suitable
   * for inclusion in the query part of a URI. But `URLEncoder.encode` should not
   * be used for path segment encoding. (Also note that `URLEncoder.encode` is
   * not quite spec compliant. For example, it percent-encodes the `~` character when
   * really it should leave it as unencoded.)
   *
   * @param s The string to encode.
   * @param inputCharset The name of the encoding that the string `s` is encoded with.
   *     The string `s` will be converted to octets (bytes) using this character encoding.
   * @return An encoded string in the US-ASCII character set. Every octet that is not
   *     an allowed path segment character is written as `%` followed by two uppercase
   *     hexadecimal digits.
   */
  def encodePathSegment(s: String, inputCharset: String): String = {
    val in = s.getBytes(inputCharset)
    val out = new ByteArrayOutputStream()
    for (b <- in) {
      // JVM bytes are signed; mask to obtain the unsigned octet value for the table lookup.
      val allowed = segmentChars.get(b & 0xFF)
      if (allowed) {
        out.write(b)
      } else {
        out.write('%')
        out.write(upperHex((b >> 4) & 0xF))
        out.write(upperHex(b & 0xF))
      }
    }
    out.toString("US-ASCII")
  }

  /**
   * Decode a string according to the rules for the "path segment"
   * part of a URI. A path segment is defined in RFC 3986. In a URI such
   * as `http://www.example.com/abc/def?a=1&b=2` both `abc` and `def`
   * are path segments.
   *
   * Path segment encoding differs from encoding for other parts of a URI.
   * For example, the "&" character is permitted in a path segment, but
   * has special meaning in query parameters. On the other hand, the "/"
   * character cannot appear in a path segment, as it is the path delimiter,
   * so it must be encoded as "%2F". These are just two examples of the
   * differences between path segment and query string encoding; there are
   * other differences too.
   *
   * When decoding path segments the `decodePathSegment` method should always
   * be used in preference to the [[java.net.URLDecoder.decode(String,String)]]
   * method. `URLDecoder.decode`, despite its name, actually decodes
   * the `application/x-www-form-urlencoded` MIME format which is the encoding
   * used for form data in HTTP GET and POST requests. This format is suitable
   * for inclusion in the query part of a URI. But `URLDecoder.decode` should not
   * be used for path segment encoding or decoding.
   *
   * @param s The string to decode. Must use the US-ASCII character set.
   * @param outputCharset The name of the encoding that the output should be encoded with.
   *     The output string will be converted from octets (bytes) using this character encoding.
   * @throws InvalidUriEncodingException If the input is not a valid encoded path segment:
   *     a `%` that is not followed by two hexadecimal digits, or a character that is not
   *     allowed in a path segment.
   * @return A decoded string in the `outputCharset` character set.
   */
  def decodePathSegment(s: String, outputCharset: String): String = {
    val in = s.getBytes("US-ASCII")
    val out = new ByteArrayOutputStream()
    var inPos = 0
    // Consume and return the next input octet as an unsigned value.
    def next(): Int = {
      val b = in(inPos) & 0xFF
      inPos += 1
      b
    }
    while (inPos < in.length) {
      val b = next()
      if (b == '%') {
        // Read high digit
        if (inPos >= in.length) throw new InvalidUriEncodingException(s"Cannot decode $s: % at end of string")
        val high = fromHex(next())
        if (high == -1) throw new InvalidUriEncodingException(s"Cannot decode $s: expected hex digit at position $inPos.")
        // Read low digit
        if (inPos >= in.length) throw new InvalidUriEncodingException(s"Cannot decode $s: incomplete percent encoding at end of string")
        val low = fromHex(next())
        if (low == -1) throw new InvalidUriEncodingException(s"Cannot decode $s: expected hex digit at position $inPos.")
        // Write decoded byte
        out.write((high << 4) + low)
      } else if (segmentChars.get(b)) {
        // This character is allowed
        out.write(b)
      } else {
        throw new InvalidUriEncodingException(s"Cannot decode $s: illegal character at position $inPos.")
      }
    }
    out.toString(outputCharset)
  }

  /**
   * Decode the path part of a URI. Each path segment will be decoded
   * using the same rules as `decodePathSegment`. No normalization is performed:
   * leading, trailing and duplicated slashes, if present, are left as they are and
   * if absent remain absent; dot-segments (".." and ".") are ignored.
   *
   * Encoded slash characters will appear as slashes in the output, thus "a/b"
   * will be indistinguishable from "a%2Fb".
   *
   * @param s The string to decode. Must use the US-ASCII character set.
   * @param outputCharset The name of the encoding that the output should be encoded with.
   *     The output string will be converted from octets (bytes) using this character encoding.
   * @throws InvalidUriEncodingException If the input is not a valid encoded path.
   * @return A decoded string in the `outputCharset` character set.
   */
  def decodePath(s: String, outputCharset: String): String = {
    // Note: Could easily expose a method to return the decoded path as a Seq[String].
    // This would allow better handling of paths segments with encoded slashes in them.
    // However, there is no need for this yet, so the method hasn't been added yet.
    splitString(s, '/').map(decodePathSegment(_, outputCharset)).mkString("/")
  }

  // RFC 3986, 3.3. Path
  // segment       = *pchar
  // segment-nz    = 1*pchar
  // segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
  //               ; non-zero-length segment without any colon ":"
  /** The set of ASCII character codes that are allowed in a URI path segment. */
  private val segmentChars: BitSet = membershipTable(pchar)

  /** The characters allowed in a path segment; defined in RFC 3986 */
  private def pchar: Seq[Char] = {
    // RFC 3986, 2.3. Unreserved Characters
    // unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
    val alphaDigit = for ((min, max) <- Seq(('a', 'z'), ('A', 'Z'), ('0', '9')); c <- min to max) yield c
    val unreserved = alphaDigit ++ Seq('-', '.', '_', '~')

    // RFC 3986, 2.2. Reserved Characters
    // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
    //             / "*" / "+" / "," / ";" / "="
    val subDelims = Seq('!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=')

    // RFC 3986, 3.3. Path
    // pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
    unreserved ++ subDelims ++ Seq(':', '@')
  }

  /** Create a BitSet to act as a membership lookup table for the given characters. */
  private def membershipTable(chars: Seq[Char]): BitSet = {
    val bits = new BitSet(256)
    for (c <- chars) { bits.set(c.toInt) }
    bits
  }

  /**
   * Given a number from 0 to 15, return the ASCII character code corresponding
   * to its uppercase hexadecimal representation.
   */
  private def upperHex(x: Int): Int = {
    // Assume 0 <= x < 16
    if (x < 10) (x + '0') else (x - 10 + 'A')
  }

  /**
   * Given the ASCII value of a character, return its value as a hex digit.
   * Only `0-9`, `A-F` and `a-f` are hex digits (RFC 3986 HEXDIG, case-insensitive).
   * If the character isn't a valid hex digit, return -1 instead.
   */
  private def fromHex(b: Int): Int = {
    if (b >= '0' && b <= '9') {
      b - '0'
    } else if (b >= 'A' && b <= 'F') {
      10 + b - 'A'
    } else if (b >= 'a' && b <= 'f') {
      10 + b - 'a'
    } else {
      -1
    }
  }

  /**
   * Split a string on a character. Similar to `String.split` except, for this method,
   * the invariant {{{splitString(s, '/').mkString("/") == s}}} holds.
   *
   * For example:
   * {{{
   * splitString("//a//", '/') == Seq("", "", "a", "", "")
   * "//a//".split('/').toSeq == Seq("", "", "a")
   * }}}
   *
   * @param s The string to split.
   * @param c The delimiter character.
   * @return The pieces of `s` between occurrences of `c`, as an immutable sequence.
   *     The result always contains at least one element (the empty string yields `Seq("")`).
   */
  private[utils] def splitString(s: String, c: Char): Seq[String] = {
    import scala.annotation.tailrec
    val result = scala.collection.mutable.ListBuffer.empty[String]
    @tailrec
    def splitLoop(start: Int): Unit = {
      // When start == s.length, indexOf returns -1 and substring(start) is "",
      // which produces the trailing empty piece after a final delimiter.
      val end = s.indexOf(c, start)
      if (end == -1) {
        result += s.substring(start)
      } else {
        result += s.substring(start, end)
        splitLoop(end + 1)
      }
    }
    splitLoop(0)
    // Hand out an immutable snapshot rather than the internal mutable buffer.
    result.toList
  }

}

/**
 * Unchecked exception signalling that a value being processed
 * is not encoded correctly.
 *
 * @param msg A description of the encoding problem; used as the exception message.
 */
class InvalidUriEncodingException(msg: String)
  extends RuntimeException(msg)