blob: 7cb92883667ee4a9c742668ac73eef51fdf03ccf [file] [log] [blame]
<?php
/**
* Syntax-based normalization of URIs
*
* This normalises URIs based on the specification RFC 3986
* https://www.rfc-editor.org/rfc/rfc3986#section-6.2.2
*
* Example usage:
* <code>
* require_once 'URLNormalizer.php';
*
* $url = 'eXAMPLE://a/./b/../b/%63/%7bfoo%7d';
* $un = new URLNormalizer();
* $un->setUrl( $url );
* echo $un->normalize();
*
* // result: "example://a/b/c/%7Bfoo%7D"
* </code>
*
* @author Glen Scott <glen_scott@yahoo.co.uk>
*/
/**
 * Syntax-based URI normalizer (RFC 3986, section 6.2.2 and 6.2.3).
 *
 * The URL is split into components with parse_url(); normalize() then
 * applies case normalization, percent-encoding normalization, dot-segment
 * removal and default-port removal, and reassembles the URI.
 */
class URLNormalizer {
    private $url;
    private $scheme;
    private $host;
    private $port;
    private $user;
    private $pass;
    private $path;
    private $query;
    private $fragment;

    /** Default port per scheme; a matching explicit port is dropped on normalization. */
    private $default_scheme_ports = array( 'http' => 80, 'https' => 443, );

    /**
     * @param string|null $url Optional URL to load immediately (see setUrl()).
     */
    public function __construct( $url=null ) {
        $this->resetComponents();
        if ( $url ) {
            $this->setUrl( $url );
        }
    }

    /**
     * @return string|null The URL exactly as it was passed to setUrl(), or null if none was set.
     */
    public function getUrl() {
        return $this->url;
    }

    /**
     * Store a URL and split it into its components.
     *
     * All components are cleared first so that parts of a previously loaded
     * URL (query, fragment, port, ...) cannot leak into the new one.
     *
     * @param string $url URL to parse.
     * @return bool true when parse_url() produced components, false otherwise.
     */
    public function setUrl( $url ) {
        $this->url = $url;
        $this->resetComponents();

        $url_components = parse_url( $this->url );
        if ( ! $url_components ) {
            return false;
        }

        $known = array( 'scheme', 'host', 'port', 'user', 'pass', 'path', 'query', 'fragment' );
        foreach ( $url_components as $key => $value ) {
            if ( in_array( $key, $known, true ) ) {
                $this->$key = $value;
            }
        }
        return true;
    }

    /**
     * @return string The scheme component ('' when the URL has none).
     */
    public function getScheme() {
        return $this->scheme;
    }

    /**
     * Normalize the loaded URL and return it as a string.
     *
     * The normalized components are written back to the object, so calling
     * this method repeatedly yields the same result.
     *
     * @return string The normalized URI.
     */
    public function normalize() {
        // Components are compared against '' rather than tested for
        // truthiness so that a literal "0" (path, query, user, ...) survives.
        if ( '' !== $this->path ) {
            // case normalization: hex digits of percent-encoded triplets are uppercased
            $this->path = preg_replace_callback(
                '/%[0-9a-f]{2}/i',
                function ( $matches ) {
                    return strtoupper( $matches[0] );
                },
                $this->path
            );
            // percent-encoding normalization
            $this->path = $this->urlDecodeUnreservedChars( $this->path );
            // path segment normalization
            $this->path = $this->removeDotSegments( $this->path );
        }

        $scheme = '';
        if ( '' !== $this->scheme ) {
            $this->scheme = strtolower( $this->scheme );
            // Only emit "//" when the URI has an authority or was written with
            // one (e.g. "file:///x"); "mailto:"/"urn:" style URIs keep a bare ":".
            $has_authority_marker = '' !== $this->host
                || 1 === preg_match( '!^[^:/?#]+://!', (string) $this->url );
            $scheme = $this->scheme . ( $has_authority_marker ? '://' : ':' );
        }

        if ( '' !== $this->host ) {
            $this->host = strtolower( $this->host );
        }

        $this->schemeBasedNormalization();

        // reconstruct uri
        $query = ( '' !== $this->query ) ? '?' . $this->query : '';
        $fragment = ( '' !== $this->fragment ) ? '#' . $this->fragment : '';
        $port = ( '' !== $this->port ) ? ':' . $this->port : '';

        $authorization = '';
        if ( '' !== $this->user || '' !== $this->pass ) {
            $authorization = $this->user;
            // Do not invent an empty password ("user:@host") for user-only URLs.
            if ( '' !== $this->pass ) {
                $authorization .= ':' . $this->pass;
            }
            $authorization .= '@';
        }

        return $scheme . $authorization . $this->host . $port . $this->path . $query . $fragment;
    }

    /**
     * Decode percent-encoded unreserved characters (RFC 3986, section 2.3).
     *
     * Triplets that encode ALPHA, DIGIT, "-", ".", "_" or "~" are replaced by
     * the character itself; every other triplet is returned unchanged. The
     * string is scanned in a single pass, so text produced by one decoding is
     * never decoded again.
     *
     * @param string $string Text that may contain percent-encoded triplets.
     * @return string The text with unreserved characters decoded.
     */
    public function urlDecodeUnreservedChars( $string ) {
        return preg_replace_callback(
            '/%([0-9A-Fa-f]{2})/',
            function ( $matches ) {
                $char = chr( hexdec( $matches[1] ) );
                if ( 1 === preg_match( '/^[A-Za-z0-9._~-]$/', $char ) ) {
                    return $char;
                }
                return $matches[0];
            },
            $string
        );
    }

    /**
     * Remove "." and ".." segments from a path (RFC 3986, section 5.2.4).
     *
     * Implements steps A-E of the algorithm with an input buffer ($path) and
     * an output buffer that is returned once the input is exhausted.
     *
     * @param string $path Path to process.
     * @return string The path without dot segments.
     */
    public function removeDotSegments( $path ) {
        $path = (string) $path;
        $new_path = '';

        // Compare against '' instead of empty(): a path of "0" is not empty.
        while ( '' !== $path ) {
            if ( 0 === strncmp( $path, '../', 3 ) ) {
                // A: leading "../" is dropped
                $path = (string) substr( $path, 3 );
            }
            elseif ( 0 === strncmp( $path, './', 2 ) ) {
                // A: leading "./" is dropped
                $path = (string) substr( $path, 2 );
            }
            elseif ( 0 === strncmp( $path, '/./', 3 ) ) {
                // B: "/./" collapses to "/"
                $path = (string) substr( $path, 2 );
            }
            elseif ( '/.' === $path ) {
                // B: a trailing "/." collapses to "/"
                $path = '/';
            }
            elseif ( 0 === strncmp( $path, '/../', 4 ) ) {
                // C: "/../" collapses to "/" and the previous output segment is removed
                $path = (string) substr( $path, 3 );
                $new_path = $this->dropLastSegment( $new_path );
            }
            elseif ( '/..' === $path ) {
                // C: only a complete ".." segment counts; "/..foo" is ordinary text
                $path = '/';
                $new_path = $this->dropLastSegment( $new_path );
            }
            elseif ( '.' === $path || '..' === $path ) {
                // D: a lone "." or ".." is dropped
                $path = '';
            }
            else {
                // E: move the first segment (with its leading "/", if any) to the output
                $next_slash = strpos( $path, '/', 1 );
                if ( false === $next_slash ) {
                    $new_path .= $path;
                    $path = '';
                }
                else {
                    $new_path .= substr( $path, 0, $next_slash );
                    $path = (string) substr( $path, $next_slash );
                }
            }
        }

        return $new_path;
    }

    /**
     * Remove the last segment and its preceding "/" (if any) from an output buffer.
     *
     * @param string $buffer Output buffer built so far.
     * @return string The buffer without its last segment.
     */
    private function dropLastSegment( $buffer ) {
        $last_slash = strrpos( $buffer, '/' );
        if ( false === $last_slash ) {
            return '';
        }
        return (string) substr( $buffer, 0, $last_slash );
    }

    /**
     * Drop the port when it equals the default port of the (lowercased) scheme.
     */
    private function schemeBasedNormalization() {
        if ( '' === $this->port || ! isset( $this->default_scheme_ports[$this->scheme] ) ) {
            return;
        }
        if ( (int) $this->port === $this->default_scheme_ports[$this->scheme] ) {
            $this->port = '';
        }
    }

    /**
     * Reset every parsed component to the empty string.
     */
    private function resetComponents() {
        $this->scheme = '';
        $this->host = '';
        $this->port = '';
        $this->user = '';
        $this->pass = '';
        $this->path = '';
        $this->query = '';
        $this->fragment = '';
    }
}