Harry: RegExps für URLs

Beitrag lesen

Holladiewaldfee,

Suche nach p r o s p e r o (ohne Leerzeichen, ich will die Archivsuche nicht verschmutzen) um ihn zu finden.

Also, nachdem Archivsuche und Archiv gerade einen Hänger haben (herzlichen Dank an die bescheuerten Scriptkiddies), poste ich hier einfach mal den kompletten Ausdruck aus einem meiner Projekte. Kann sein, daß da noch ein paar Konstanten von mir rumfliegen; wenn ja, sollten sie aber selbsterklärend sein. Der "große" Unterschied zu dem Ausdruck im Archiv ist, daß dieser hier theoretisch auch IPv6-Adressen verarbeiten kann, allerdings hat er da noch Schwächen. Aber nachdem das im Endeffekt ja eh bloß Spielerei ist und man da wirklich mit Kanonen auf Spatzen (Spatzen? Mücken!) schießt, sollte es reichen:


# Basic character classes of the RFC 1738 URL grammar (ported from a Perl
# original). Every variable holds a regex fragment wrapped in a non-capturing
# group so the fragments can be concatenated freely.
#
# Literal fragments are single-quoted on purpose: inside a double-quoted PHP
# string "$_" is parsed as a variable (silently dropping "$" and "_" from the
# "safe" class), and the double quote in the punctuation class would end the
# string early.
$lcexp_lowalpha    = '(?:[a-z])';
$lcexp_hialpha     = '(?:[A-Z])';
$lcexp_alpha       = '(?:' . $lcexp_lowalpha . '|' . $lcexp_hialpha . ')';
$lcexp_digit       = '(?:\d)';
$lcexp_safe        = '(?:[$_.+-])';
$lcexp_extra       = "(?:[!*'(),])";
# Backslash and both square brackets are escaped so they stay inside the class.
$lcexp_national    = '(?:[{}|\\\\^~\[\]`])';
$lcexp_punctuation = '(?:[<>#%"])';
$lcexp_reserved    = '(?:[;/?:@&=])';
$lcexp_hex         = '(?:[\dA-Fa-f])';
# Percent-encoded octet, e.g. "%2F".
$lcexp_escape      = '(?:%' . $lcexp_hex . $lcexp_hex . ')';
$lcexp_unreserved  = '(?:' . $lcexp_alpha . '|' . $lcexp_digit . '|' . $lcexp_safe . '|' . $lcexp_extra . ')';
$lcexp_uchar       = '(?:' . $lcexp_unreserved . '|' . $lcexp_escape . ')';
$lcexp_xchar       = '(?:' . $lcexp_unreserved . '|' . $lcexp_escape . '|' . $lcexp_reserved . ')';
$lcexp_digits      = '(?:\d+)';
$lcexp_alphadigit  = '(?:' . $lcexp_alpha . '|\d)';

# URL scheme parts for IP-based protocols (IPv4 and IPv6).
# The repetition quantifiers follow the RFC 1738 BNF (urlpath = *xchar,
# user = *[ uchar | ... ], hostname = *[ domainlabel "." ] toplabel, ...).
# All groups are non-capturing so the combined expressions do not run into
# limits on the number of capturing parentheses.
$lcexp_urlpath  = '(?:' . $lcexp_xchar . '*)';
$lcexp_user     = '(?:(?:' . $lcexp_uchar . '|[;?&=])*)';
$lcexp_password = '(?:(?:' . $lcexp_uchar . '|[;?&=])*)';
# Port number 0-65535 (up to five digits; leading zeros are tolerated).
$lcexp_port     = '(?:[0-5]?\d\d?\d?\d?|6[0-4]\d\d\d|65[0-4]\d\d|655[0-2]\d|6553[0-5])';
#  $lcexp_ip4part        =  "(?:[01]?\d\d?|2[0-4]\d|25[0-5])";

# IPv4: one octet 0-255 without leading zeros; the final octet must not be 0.
$lcexp_domain_ip4_part      = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])';
$lcexp_domain_ip4_part_last = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[1-9])';
$lcexp_domain_ip4           = '(?:(?:' . $lcexp_domain_ip4_part . '\.){3}' . $lcexp_domain_ip4_part_last . ')';

# IPv6: one hex group of 1-4 digits without leading zeros.
$lcexp_domain_ip6_part  = '(?:0|[1-9a-fA-F][0-9a-fA-F]{0,3})';
# Full form: eight groups separated by colons.
$lcexp_domain_ip6_pure  = '(?:(?:' . $lcexp_domain_ip6_part . ':){7}' . $lcexp_domain_ip6_part . ')';
# Compressed form containing "::". This is an approximation: it does not
# check that the total number of groups stays below eight.
$lcexp_domain_ip6_short = '(?:(?:(?:' . $lcexp_domain_ip6_part . ':){1,6}|:):(?:' . $lcexp_domain_ip6_part . ':){0,5}' . $lcexp_domain_ip6_part . ')';
$lcexp_domain_ip6       = '(?:' . $lcexp_domain_ip6_pure . '|' . $lcexp_domain_ip6_short . ')';
# Mixed notation: hex groups followed by a dotted-quad IPv4 address.
# NOTE(review): RFC 4291 uses six hex groups before the dotted quad in the
# uncompressed form; four are matched here as in the original - confirm.
$lcexp_domain_ip6_4       = '(?:(?:' . $lcexp_domain_ip6_part . ':){4}' . $lcexp_domain_ip4 . ')';
$lcexp_domain_ip6_4_short = '(?:(?:(?:' . $lcexp_domain_ip6_part . ':){1,3}|:):(?:' . $lcexp_domain_ip6_part . ':){0,2}' . $lcexp_domain_ip4 . ')';
$lcexp_domain_ip          = '(?:' . $lcexp_domain_ip4 . '|' . $lcexp_domain_ip6 . '|' . $lcexp_domain_ip6_4 . '|' . $lcexp_domain_ip6_4_short . ')';
$lcexp_hostnumber         = $lcexp_domain_ip;
#  $lcexp_hostnumber     =  "(?:(?!0+.0+.0+.0+)(?!255.255.255.255)".$lcexp_ip4part."\.".$lcexp_ip4part."\.".$lcexp_ip4part."\.".$lcexp_ip4part.")";

# Host names: labels start and end with an alphanumeric character and may
# contain hyphens in between; the top-level label must start with a letter.
$lcexp_toplabel    = '(?:(?:' . $lcexp_alpha . '(?:' . $lcexp_alphadigit . '|-)*' . $lcexp_alphadigit . ')|' . $lcexp_alpha . ')';
$lcexp_domainlabel = '(?:(?:' . $lcexp_alphadigit . '(?:' . $lcexp_alphadigit . '|-)*' . $lcexp_alphadigit . ')|' . $lcexp_alphadigit . ')';
$lcexp_hostname    = '(?:(?:' . $lcexp_domainlabel . '\.)*' . $lcexp_toplabel . ')';
$lcexp_host        = '(?:(?:' . $lcexp_hostname . ')|(?:' . $lcexp_hostnumber . '))';

# host[:port], optionally preceded by user[:password]@, then "//login[/path]".
$lcexp_hostport      = '(?:(?:' . $lcexp_host . ')(?::' . $lcexp_port . ')?)';
$lcexp_login         = '(?:(?:' . $lcexp_user . '(?::' . $lcexp_password . ')?@)?' . $lcexp_hostport . ')';
$lcexp_ip_schemepart = '(?://' . $lcexp_login . '(?:/' . $lcexp_urlpath . ')?)';

# Generic scheme-specific part: either any run of xchars or the "//login/path"
# form used by IP-based protocols.
$lcexp_schemepart = "(?:{$lcexp_xchar}*|{$lcexp_ip_schemepart})";
# Scheme name: one or more lowercase letters, digits, "+", "." or "-".
$lcexp_scheme     = "(?:(?:{$lcexp_lowalpha}|{$lcexp_digit}|[+.-])+)";

# The predefined schemes:

# FTP (see also RFC959)
# Per RFC 1738: fsegment = *[ uchar | "?" | ":" | "@" | "&" | "=" ] and
# fpath = fsegment *[ "/" fsegment ]; the repetition quantifiers are required,
# otherwise each segment could only be a single character.
$lcexp_fsegment = '(?:(?:' . $lcexp_uchar . '|[?:@&=])*)';
# Transfer type: A (ASCII), I (image/binary) or D (directory listing).
$lcexp_ftptype  = '(?:[AIDaid])';
$lcexp_fpath    = '(?:' . $lcexp_fsegment . '(?:/' . $lcexp_fsegment . ')*)';
$lcexp_ftpurl   = '(?:ftp://' . $lcexp_login . '(?:/' . $lcexp_fpath . '(?:;type=' . $lcexp_ftptype . ')?)?)';

# FILE
# "file://" followed by an optional host (or the literal "localhost"), a slash
# and an FTP-style path.
$lcexp_fileurl = "(?:file://(?:(?:{$lcexp_host})|localhost)?/{$lcexp_fpath})";

# HTTP
# Characters allowed unescaped in HTTP paths and queries: letters, digits, the
# "safe" class and a reduced "extra" set (no parentheses), or a %XX escape.
$lcexp_httpuchar = '(?:(?:' . $lcexp_alpha . '|' . $lcexp_digit . '|' . $lcexp_safe . "|(?:[!*',]))|" . $lcexp_escape . ')';
# Path segments and query strings are runs of such characters (zero or more);
# "~" is accepted in addition to what RFC 1738 lists.
$lcexp_hsegment  = '(?:(?:' . $lcexp_httpuchar . '|[;:@&=~])*)';
$lcexp_search    = '(?:(?:' . $lcexp_httpuchar . '|[;:@&=~])*)';
# hpath = hsegment *[ "/" hsegment ]
$lcexp_hpath     = '(?:' . $lcexp_hsegment . '(?:/' . $lcexp_hsegment . ')*)';
# Lenient variant: additionally allows a trailing "#fragment".
$lcexp_httpurl        = '(?:http://' . $lcexp_hostport . '(?:/' . $lcexp_hpath . '(?:\?' . $lcexp_search . ')?)?(?:#' . $lcexp_xchar . '*)?)';
# Strict variant: host, optional path and optional query only.
$lcexp_strict_httpurl = '(?:http://' . $lcexp_hostport . '(?:/' . $lcexp_hpath . '(?:\?' . $lcexp_search . ')?)?)';

# GOPHER (see also RFC1436)
# Per RFC 1738: gtype = xchar (exactly one character), selector = *xchar and
# gopher+_string = *xchar.
$lcexp_gopher_plus = '(?:' . $lcexp_xchar . '*)';
$lcexp_selector    = '(?:' . $lcexp_xchar . '*)';
$lcexp_gtype       = '(?:' . $lcexp_xchar . ')';
# The optional search string and Gopher+ string are separated by an encoded
# tab (%09).
$lcexp_gopherurl   = '(?:gopher://' . $lcexp_hostport . '(?:/' . $lcexp_gtype . '(?:' . $lcexp_selector . '(?:%09' . $lcexp_search . '(?:%09' . $lcexp_gopher_plus . ')?)?)?)?)';

# NEWS (see also RFC1036)
# article = 1*[ uchar | ";" | "/" | "?" | ":" | "&" | "=" ] "@" host
$lcexp_article   = '(?:(?:' . $lcexp_uchar . '|[;/?:&=])+@' . $lcexp_host . ')';
# group = alpha *[ alpha | digit | "-" | "." | "+" | "_" ]
$lcexp_group     = '(?:' . $lcexp_alpha . '(?:' . $lcexp_alpha . '|' . $lcexp_digit . '|[.+_-])*)';
# grouppart = "*" | group | article. The literal asterisk must be escaped;
# escaping the closing parenthesis instead leaves the group unbalanced.
$lcexp_grouppart = '(?:' . $lcexp_article . '|' . $lcexp_group . '|\*)';
$lcexp_newsurl   = '(?:news:' . $lcexp_grouppart . ')';

# NNTP (see also RFC977)
# "nntp://" host[:port] "/" newsgroup, optionally followed by "/" and an
# article number.
$lcexp_nntpurl = "(?:nntp://{$lcexp_hostport}/{$lcexp_group}(?:/{$lcexp_digits})?)";

# TELNET
# "telnet://" followed by the login part and an optional trailing slash.
$lcexp_telneturl = "(?:telnet://{$lcexp_login}(?:/)?)";

# WAIS (see also RFC1625)
# Per RFC 1738 wpath, wtype and database are all *uchar (zero or more).
$lcexp_wpath        = '(?:' . $lcexp_uchar . '*)';
$lcexp_wtype        = '(?:' . $lcexp_uchar . '*)';
$lcexp_database     = '(?:' . $lcexp_uchar . '*)';
# Three URL forms: a document, an index search and a plain database.
$lcexp_waisdoc      = '(?:wais://' . $lcexp_hostport . '/' . $lcexp_database . '/' . $lcexp_wtype . '/' . $lcexp_wpath . ')';
$lcexp_waisindex    = '(?:wais://' . $lcexp_hostport . '/' . $lcexp_database . '\?' . $lcexp_search . ')';
$lcexp_waisdatabase = '(?:wais://' . $lcexp_hostport . '/' . $lcexp_database . ')';
#  $lcexp_waisurl        =  "(?:".$lcexp_waisdatabase."|".$lcexp_waisindex."|".$lcexp_waisdoc.")";
#  Disabled by the author: the combined expression ran into "too many
#  capturing parentheses" - and who uses WAIS with IPv6 anyway?

# PROSPERO
# Per RFC 1738 field names, field values and path segments are zero-or-more
# runs of the listed characters, ppath = psegment *[ "/" psegment ], and a URL
# may carry any number of ";name=value" field specs.
$lcexp_fieldvalue  = '(?:(?:' . $lcexp_uchar . '|[?:@&])*)';
$lcexp_fieldname   = '(?:(?:' . $lcexp_uchar . '|[?:@&])*)';
$lcexp_fieldspec   = '(?:;' . $lcexp_fieldname . '=' . $lcexp_fieldvalue . ')';
$lcexp_psegment    = '(?:(?:' . $lcexp_uchar . '|[?:@&=])*)';
$lcexp_ppath       = '(?:' . $lcexp_psegment . '(?:/' . $lcexp_psegment . ')*)';
$lcexp_prosperourl = '(?:prospero://' . $lcexp_hostport . '/' . $lcexp_ppath . '(?:' . $lcexp_fieldspec . ')*)';

# Result: map of protocol name => complete, anchored PCRE pattern.
# A backtick is used as delimiter because the URL expressions contain
# unescaped "/" (and also "#", "~", "%", "!"), which would terminate a
# "/"-delimited pattern early. The D modifier makes "$" match only at the very
# end of the subject, so a trailing newline is not silently accepted.
$check_array = array(
    "http"        => '`^' . $lcexp_httpurl . '$`D',
    "strict_http" => '`^' . $lcexp_strict_httpurl . '$`D',
    "ftp"         => '`^' . $lcexp_ftpurl . '$`D',
    "news"        => '`^' . $lcexp_newsurl . '$`D',
    "file"        => '`^' . $lcexp_fileurl . '$`D',
    "telnet"      => '`^' . $lcexp_telneturl . '$`D',
    "nntp"        => '`^' . $lcexp_nntpurl . '$`D',
    "gopher"      => '`^' . $lcexp_gopherurl . '$`D',
#   "wais"        => '`^' . $lcexp_waisurl . '$`D',
    "wais_doc"    => '`^' . $lcexp_waisdoc . '$`D',
    "wais_idx"    => '`^' . $lcexp_waisindex . '$`D',
    "wais_db"     => '`^' . $lcexp_waisdatabase . '$`D',
    "prospero"    => '`^' . $lcexp_prosperourl . '$`D'
);

# Relative links: when the fcms_bLINKRFCALLOWRELATIVE constant is enabled, a
# bare HTTP path or a bare query string is accepted as well.
# The query expression is defined as $lcexp_search; no $lcexp_hsearch exists.
# A backtick delimiter is used because the path expression contains "/".
if (fcms_bLINKRFCALLOWRELATIVE == true) {
    $check_array["hpath"]   = '`^' . $lcexp_hpath . '$`D';
    $check_array["hsearch"] = '`^' . $lcexp_search . '$`D';
}

# Validation: $validated becomes true as soon as $data["link"] matches any of
# the patterns in $check_array.
# foreach replaces while(list() = each()): each() is deprecated since PHP 7.2
# and was removed in PHP 8.
$validated = false;
foreach ($check_array as $protocol => $protregexp) {
    if (preg_match($protregexp, $data["link"])) {
        $validated = true;
        break; # one successful match is enough - saves computing time
    }
}

Das war der erste Teil ...

Ciao,

Harry

--
  Man weiß erst was man hatte, wenn man es verloren hat.   42? Eher sh:| fo:) ch:] rl:° br:& n4:° ie:% mo:) va:) de:[ zu:) fl:( ss:) ls:[ js:|