That's a lot for one regexp to do. You're looking at more elaborate parsing there.
This isn't a regexp, but a grammar. It's pretty straightforward: for example, "?" | ":" | "@" | "&" would be written in a regexp as [?:@&], and ";" fieldname "=" fieldvalue means "a semicolon followed by a fieldname (defined elsewhere) followed by an equals sign followed by a fieldvalue (defined elsewhere)".
It covers http, https, ftp, news, nntp, telnet, gopher, wais, mailto, file, prospero and generic URLs. It only handles complete (full) URLs, not bare forms such as "www.example.com", and does not check that protocol-specific things like email addresses or ftp paths are valid.
In other words - you're asking a fair bit 🙂
; The generic form of a URL is:
genericurl = scheme ":" schemepart
; Specific predefined schemes are defined here; new schemes
; may be registered with IANA
url = httpurl | httpsurl | ftpurl | newsurl |
nntpurl | telneturl | gopherurl |
waisurl | mailtourl | fileurl |
prosperourl | otherurl
; new schemes follow the general syntax
otherurl = genericurl
; the scheme is in lower case; interpreters should use case-ignore
scheme = ( lowalpha | digit | "+" | "-" | "." )+
schemepart = xchar* | ip-schemepart
; URL schemeparts for ip based protocols:
ip-schemepart = "//" login ( "/" urlpath )?
login = ( user ( ":" password )? "@" )? hostport
hostport = host ( ":" port )?
host = hostname | hostnumber
hostname = ( domainlabel "." )* toplabel
domainlabel = alphadigit | alphadigit ( alphadigit | "-" )* alphadigit
toplabel = alpha | alpha ( alphadigit | "-" )* alphadigit
alphadigit = alpha | digit
hostnumber = digits "." digits "." digits "." digits
port = digits
user = ( uchar | ";" | "?" | "&" | "=" )*
password = ( uchar | ";" | "?" | "&" | "=" )*
urlpath = xchar* ; depends on protocol see section 3.1
; The predefined schemes:
; FTP (see also RFC959)
ftpurl = "ftp://" login ( "/" fpath ( ";type=" ftptype )?)?
fpath = fsegment ( "/" fsegment )*
fsegment = ( uchar | "?" | ":" | "@" | "&" | "=" )*
ftptype = "A" | "I" | "D" | "a" | "i" | "d"
; FILE
fileurl = "file://" ( host | "localhost" )? "/" fpath
; HTTP/HTTPS
httpurl = "http://" hostport ( "/" hpath ( "?" search )?)?
httpsurl = "https://" hostport ( "/" hpath ( "?" search )?)?
hpath = hsegment ( "/" hsegment )*
hsegment = ( uchar | ";" | ":" | "@" | "&" | "=" )*
search = ( uchar | ";" | ":" | "@" | "&" | "=" )*
; GOPHER (see also RFC1436)
gopherurl = "gopher://" hostport ( "/" ( gtype ( selector
( "%09" search ( "%09" gopher+_string )?)?)?)?)?
gtype = xchar
selector = xchar*
gopher+_string = xchar*
; MAILTO (see also RFC822)
mailtourl = "mailto:" encoded822addr
encoded822addr = xchar+ ; further defined in RFC822
; NEWS (see also RFC1036)
newsurl = "news:" grouppart
grouppart = "*" | group | article
group = alpha ( alpha | digit | "-" | "." | "+" | "_" )*
article = ( uchar | ";" | "/" | "?" | ":" | "&" | "=" )+ "@" host
; NNTP (see also RFC977)
nntpurl = "nntp://" hostport "/" group ( "/" digits )?
; TELNET
telneturl = "telnet://" login ( "/" )?
; WAIS (see also RFC1625)
waisurl = waisdatabase | waisindex | waisdoc
waisdatabase = "wais://" hostport "/" database
waisindex = "wais://" hostport "/" database "?" search
waisdoc = "wais://" hostport "/" database "/" wtype "/" wpath
database = uchar*
wtype = uchar*
wpath = uchar*
; PROSPERO
prosperourl = "prospero://" hostport "/" ppath ( fieldspec )*
ppath = psegment ( "/" psegment )*
psegment = ( uchar | "?" | ":" | "@" | "&" | "=" )*
fieldspec = ";" fieldname "=" fieldvalue
fieldname = ( uchar | "?" | ":" | "@" | "&" )*
fieldvalue = ( uchar | "?" | ":" | "@" | "&" )*
; Miscellaneous definitions
lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" |
"i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" |
"q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" |
"y" | "z"
hialpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
"J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
"S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
alpha = lowalpha | hialpha
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
"8" | "9"
safe = "$" | "-" | "_" | "." | "+"
extra = "!" | "*" | "'" | "(" | ")" | ","
national = "{" | "}" | "|" | "\" | "^" | "~" | "[" | "]" | "`"
punctuation = "<" | ">" | "#" | "%" | <">
reserved = ";" | "/" | "?" | ":" | "@" | "&" | "="
hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
"a" | "b" | "c" | "d" | "e" | "f"
escape = "%" hex hex
unreserved = alpha | digit | safe | extra
uchar = unreserved | escape
xchar = unreserved | reserved | escape
digits = digit+