ASPN ActiveState Programmer Network  
ActiveState, a division of Sophos
/ Home / Perl / PHP / Python / Tcl / XSLT /
/ Safari / My ASPN /
Cookbooks | Documentation | Mailing Lists | Modules | News Feeds | Products | User Groups
Submit Recipe
My Recipes

All Recipes
All Cookbooks


View by Category

Title: Finding URLs in text -- the COMPLETE way
Submitter: Ken Simpson (other recipes)
Last Updated: 2001/05/28
Version no: 1.0
Category: Networking

 

4 stars 5 vote(s)


Editors pick

Description:

This substitution expression, posted by Abigail to comp.lang.perl.misc on 08/14/2000,
matches URLs and turns them into HTML links. This regular expression follows
the complete URL grammar as defined in RFC 1738.

Usage: Text Source

# Replace every URL found in $string with an HTML link to that URL.
# The pattern follows the URL grammar of RFC 1738 (http, ftp, news, nntp,
# telnet, gopher, wais, mailto, file, prospero) plus ldap, z39.50, cid, mid,
# vemmi, imap and nfs.
#
# The pattern text is wrapped at roughly 70 columns purely for display.  The
# wraps fall in the middle of tokens such as "(?:" and "{2}" and inside
# character classes, so the text cannot be handed to the regex engine as it
# is laid out (not even under /x).  It is therefore kept in a
# non-interpolating here-document and every whitespace character is stripped
# before it is compiled; the pattern itself contains no literal whitespace.
#
# Points that are easy to get wrong when editing the pattern:
#   * the dot between IPv4 octets must be written "\." in every branch
#     (a bare "." matches any character);
#   * inside the "\$\-_" character classes the hyphen must stay escaped,
#     otherwise "$-_" becomes a range that also admits "<", ">", "[", "\",
#     "]" and "^";
#   * the only capturing group inside the pattern belongs to the ldap
#     branch, so the whole pattern is captured explicitly below to make $1
#     the matched URL.
(my $url_pattern = <<'END_OF_URL_PATTERN') =~ s/\s+//g;
(?:http://(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.
)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)
){3}))(?::(?:\d+))?)(?:/(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F
\d]{2}))|[;:@&=])*)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{
2}))|[;:@&=])*))*)(?:\?(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{
2}))|[;:@&=])*))?)?)|(?:ftp://(?:(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?
:%[a-fA-F\d]{2}))|[;?&=])*)(?::(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-
fA-F\d]{2}))|[;?&=])*))?@)?(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-
)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?
:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?))(?:/(?:(?:(?:(?:[a-zA-Z\d$\-_.+!
*'(),]|(?:%[a-fA-F\d]{2}))|[?:@&=])*)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'()
,]|(?:%[a-fA-F\d]{2}))|[?:@&=])*))*)(?:;type=[AIDaid])?)?)|(?:news:(?:
(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[;/?:&=])+@(?:(?:(
?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[
a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3})))|(?:[a-zA-Z](
?:[a-zA-Z\d]|[_.+-])*)|\*))|(?:nntp://(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[
a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d
])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?)/(?:[a-zA-Z](?:[a-zA-Z
\d]|[_.+-])*)(?:/(?:\d+))?)|(?:telnet://(?:(?:(?:(?:(?:[a-zA-Z\d$\-_.+
!*'(),]|(?:%[a-fA-F\d]{2}))|[;?&=])*)(?::(?:(?:(?:[a-zA-Z\d$\-_.+!*'()
,]|(?:%[a-fA-F\d]{2}))|[;?&=])*))?@)?(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a
-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d]
)?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?))/?)|(?:gopher://(?:(?:
(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:
(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+
))?)(?:/(?:[a-zA-Z\d$\-_.+!*'(),;/?:@&=]|(?:%[a-fA-F\d]{2}))(?:(?:(?:[
a-zA-Z\d$\-_.+!*'(),;/?:@&=]|(?:%[a-fA-F\d]{2}))*)(?:%09(?:(?:(?:[a-zA
-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[;:@&=])*)(?:%09(?:(?:[a-zA-Z\d$
\-_.+!*'(),;/?:@&=]|(?:%[a-fA-F\d]{2}))*))?)?)?)?)|(?:wais://(?:(?:(?:
(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:
[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?
)/(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))*)(?:(?:/(?:(?:[a-zA
-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))*)/(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(
?:%[a-fA-F\d]{2}))*))|\?(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]
{2}))|[;:@&=])*))?)|(?:mailto:(?:(?:[a-zA-Z\d$\-_.+!*'(),;/?:@&=]|(?:%
[a-fA-F\d]{2}))+))|(?:file://(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]
|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:
(?:\d+)(?:\.(?:\d+)){3}))|localhost)?/(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'()
,]|(?:%[a-fA-F\d]{2}))|[?:@&=])*)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(
?:%[a-fA-F\d]{2}))|[?:@&=])*))*))|(?:prospero://(?:(?:(?:(?:(?:[a-zA-Z
\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)
*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?)/(?:(?:(?:(?
:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[?:@&=])*)(?:/(?:(?:(?:[a-
zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[?:@&=])*))*)(?:(?:;(?:(?:(?:[
a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[?:@&])*)=(?:(?:(?:[a-zA-Z\d
$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[?:@&])*)))*)|(?:ldap://(?:(?:(?:(?:
(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?:
[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))?
))?/(?:(?:(?:(?:(?:(?:(?:[a-zA-Z\d]|%(?:3\d|[46][a-fA-F\d]|[57][Aa\d])
)|(?:%20))+|(?:OID|oid)\.(?:(?:\d+)(?:\.(?:\d+))*))(?:(?:%0[Aa])?(?:%2
0)*)=(?:(?:%0[Aa])?(?:%20)*))?(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F
\d]{2}))*))(?:(?:(?:%0[Aa])?(?:%20)*)\+(?:(?:%0[Aa])?(?:%20)*)(?:(?:(?
:(?:(?:[a-zA-Z\d]|%(?:3\d|[46][a-fA-F\d]|[57][Aa\d]))|(?:%20))+|(?:OID
|oid)\.(?:(?:\d+)(?:\.(?:\d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])
?(?:%20)*))?(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))*)))*)(?:(
?:(?:(?:%0[Aa])?(?:%20)*)(?:[;,])(?:(?:%0[Aa])?(?:%20)*))(?:(?:(?:(?:(
?:(?:[a-zA-Z\d]|%(?:3\d|[46][a-fA-F\d]|[57][Aa\d]))|(?:%20))+|(?:OID|o
id)\.(?:(?:\d+)(?:\.(?:\d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])?(
?:%20)*))?(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))*))(?:(?:(?:
%0[Aa])?(?:%20)*)\+(?:(?:%0[Aa])?(?:%20)*)(?:(?:(?:(?:(?:[a-zA-Z\d]|%(
?:3\d|[46][a-fA-F\d]|[57][Aa\d]))|(?:%20))+|(?:OID|oid)\.(?:(?:\d+)(?:
\.(?:\d+))*))(?:(?:%0[Aa])?(?:%20)*)=(?:(?:%0[Aa])?(?:%20)*))?(?:(?:[a
-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))*)))*))*(?:(?:(?:%0[Aa])?(?:%2
0)*)(?:[;,])(?:(?:%0[Aa])?(?:%20)*))?)(?:\?(?:(?:(?:(?:[a-zA-Z\d$\-_.+
!*'(),]|(?:%[a-fA-F\d]{2}))+)(?:,(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-f
A-F\d]{2}))+))*)?)(?:\?(?:base|one|sub)(?:\?(?:((?:[a-zA-Z\d$\-_.+!*'(
),;/?:@&=]|(?:%[a-fA-F\d]{2}))+)))?)?)?)|(?:(?:z39\.50[rs])://(?:(?:(?
:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?)\.)*(?:[a-zA-Z](?:(?
:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:\d+)){3}))(?::(?:\d+))
?)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))+)(?:\+(?:(?:
[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))+))*(?:\?(?:(?:[a-zA-Z\d$\-_
.+!*'(),]|(?:%[a-fA-F\d]{2}))+))?)?(?:;esn=(?:(?:[a-zA-Z\d$\-_.+!*'(),
]|(?:%[a-fA-F\d]{2}))+))?(?:;rs=(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA
-F\d]{2}))+)(?:\+(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))+))*)
?))|(?:cid:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[;?:@&=
])*))|(?:mid:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[;?:@
&=])*)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[;?:@&=]
)*))?)|(?:vemmi://(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z
\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.
(?:\d+)){3}))(?::(?:\d+))?)(?:/(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a
-fA-F\d]{2}))|[/?:@&=])*)(?:(?:;(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a
-fA-F\d]{2}))|[/?:@&])*)=(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d
]{2}))|[/?:@&])*))*))?)|(?:imap://(?:(?:(?:(?:(?:(?:(?:[a-zA-Z\d$\-_.+
!*'(),]|(?:%[a-fA-F\d]{2}))|[&=~])+)(?:(?:;[Aa][Uu][Tt][Hh]=(?:\*|(?:(
?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[&=~])+))))?)|(?:(?:;[
Aa][Uu][Tt][Hh]=(?:\*|(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2
}))|[&=~])+)))(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[
&=~])+))?))@)?(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])
?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:\.(?:
\d+)){3}))(?::(?:\d+))?))/(?:(?:(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:
%[a-fA-F\d]{2}))|[&=~:@/])+)?;[Tt][Yy][Pp][Ee]=(?:[Ll](?:[Ii][Ss][Tt]|
[Ss][Uu][Bb])))|(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))
|[&=~:@/])+)(?:\?(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[
&=~:@/])+))?(?:(?:;[Uu][Ii][Dd][Vv][Aa][Ll][Ii][Dd][Ii][Tt][Yy]=(?:[1-
9]\d*)))?)|(?:(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[&=~
:@/])+)(?:(?:;[Uu][Ii][Dd][Vv][Aa][Ll][Ii][Dd][Ii][Tt][Yy]=(?:[1-9]\d*
)))?(?:/;[Uu][Ii][Dd]=(?:[1-9]\d*))(?:(?:/;[Ss][Ee][Cc][Tt][Ii][Oo][Nn
]=(?:(?:(?:[a-zA-Z\d$\-_.+!*'(),]|(?:%[a-fA-F\d]{2}))|[&=~:@/])+)))?))
)?)|(?:nfs:(?:(?://(?:(?:(?:(?:(?:[a-zA-Z\d](?:(?:[a-zA-Z\d]|-)*[a-zA-
Z\d])?)\.)*(?:[a-zA-Z](?:(?:[a-zA-Z\d]|-)*[a-zA-Z\d])?))|(?:(?:\d+)(?:
\.(?:\d+)){3}))(?::(?:\d+))?)(?:(?:/(?:(?:(?:(?:(?:[a-zA-Z\d\$\-_.!~*'
(),])|(?:%[a-fA-F\d]{2})|[:@&=+])*)(?:/(?:(?:(?:[a-zA-Z\d\$\-_.!~*'(),
])|(?:%[a-fA-F\d]{2})|[:@&=+])*))*)?)))?)|(?:/(?:(?:(?:(?:(?:[a-zA-Z\d
\$\-_.!~*'(),])|(?:%[a-fA-F\d]{2})|[:@&=+])*)(?:/(?:(?:(?:[a-zA-Z\d\$\-
_.!~*'(),])|(?:%[a-fA-F\d]{2})|[:@&=+])*))*)?))|(?:(?:(?:(?:(?:[a-zA-
Z\d\$\-_.!~*'(),])|(?:%[a-fA-F\d]{2})|[:@&=+])*)(?:/(?:(?:(?:[a-zA-Z\d
\$\-_.!~*'(),])|(?:%[a-fA-F\d]{2})|[:@&=+])*))*)?)))
END_OF_URL_PATTERN
$string =~ s<($url_pattern)><<a href = "$1">$1</a>>g;

The license for this recipe is available here.

Discussion:



Add comment

Number of comments: 11

Interesting, but totally unusable, David Levin, 2001/06/03
This is more appropriate for a Perl Journal competition, but not as an example of what should be submitted. I learned nothing from it, cannot, and will not use it. I fail to see why it deserves the editor's choice status.
Add comment

Unusable?, Ray Graham, 2001/06/04
I beg to differ. The regEx definitely is usable, though I agree that, from the author's comments, you can't learn much easily. The author just needed to show the regEx and then break it down for those wanting to know more. Depends on why you use the Rx Cookbook. If you use it solely to learn, then this post is meaningless. But if you are looking for the A+ solution to a very common problem, then this regEx will save you weeks of frustration.
Add comment

Minor typo found, Roy Beatty, 2001/08/10
On line 15, a ":" should be inserted in column 26.

...

Nah, just kidding. But I'm curious: Does this entry hold the record for being the longest regex with a useful purpose?
Add comment

Rx Toolkit, Serge Sozonoff, 2001/12/12
Anyone thrown this into the Rx Toolkit in Komodo yet :-) Serge
Add comment

Slight modification required for https, John Liu, 2002/03/13
Change http to https? and the regular expression works beautifully.

If you are after learning what this regular expression does, try reading the RFC (here: http://www.w3.org/Addressing/rfc1738.txt) instead. This regular expression I imagine would be a direct translation of the BNF grammar defined.

Add comment

I couldn't get it to work, neil Burnett, 2002/12/16
Did anyone actually test this regular expression? Here is what I got, and I don't think I even want to start to debug it :-): Sequence (? ...) not recognized in regex; marked by [the rest of the error message was cut off in the original post]
Add comment

Mistakes, Greg Hart, 2003/07/31
There are only a couple of mistakes that I could find. When copying the code, it retains the line breaks, so those have to be deleted when you copy. On the last line, it's supposed to be $& not $1. Just thought I'd let everybody know.
Add comment

another notes, Slava Iutin, 2004/10/22
Yes, also it does not match https and ftps protocols. Very easy to change 'http' with 'https?' and 'ftp' with 'ftps?'
Add comment

much easier, Slava Iutin, 2004/10/22
/\bhttps?:[^\s<>\"]+[\w\/]/i
Add comment

Had to add another grouping for it to work in python, Sam Peterson, 2005/02/28
I finally got this to work in python. I had to add another layer of parentheses around the entire thing and remove all the breaks. 'Twas a nightmare. Since I don't have an easy way of displaying the pattern all on one line without completely hosing this page, I've encoded it in base64, then decoded it into a variable and finally compiled it as a pattern. It seems to do the job though it returns two results, the first of which is the match, the second, an empty string. Does the job, but I'm through playing with this for now. It would have been nice if the author of the regex had done something similar and more cross platform in the first place (not knocking their work though, it's a very effective RE).

Here's what I came up with in python:

#!/usr/bin/python

import base64, re

# Base64 text of the URL regular expression, stored encoded so that the very
# long single-line pattern does not have to appear literally in the source.
# The encoded pattern opens with "((?:http://", i.e. the original expression
# wrapped in one extra capturing group.
# NOTE(review): decoding shows the pattern carries two quirks from the copied
# listing -- a bare "." between IPv4 octets in the vemmi branch and an
# unescaped "-" in one nfs character class ("\$-_") -- confirm, and re-encode
# if they should be fixed.
pattern_in_base64 = r"""
KCg/Omh0dHA6Ly8oPzooPzooPzooPzooPzpbYS16QS1aXGRdKD86KD86W2EtekEtWlxkXXwtKSpb
YS16QS1aXGRdKT8pXC4pKig/OlthLXpBLVpdKD86KD86W2EtekEtWlxkXXwtKSpbYS16QS1aXGRd
KT8pKXwoPzooPzpcZCspKD86XC4oPzpcZCspKXszfSkpKD86Oig/OlxkKykpPykoPzovKD86KD86
KD86KD86W2EtekEtWlxkJFwtXy4rISonKCksXXwoPzolW2EtZkEtRlxkXXsyfSkpfFs7OkAmPV0p
KikoPzovKD86KD86KD86W2EtekEtWlxkJFwtXy4rISonKCksXXwoPzolW2EtZkEtRlxkXXsyfSkp
fFs7OkAmPV0pKikpKikoPzpcPyg/Oig/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVth
LWZBLUZcZF17Mn0pKXxbOzpAJj1dKSopKT8pPyl8KD86ZnRwOi8vKD86KD86KD86KD86KD86W2Et
ekEtWlxkJFwtXy4rISonKCksXXwoPzolW2EtZkEtRlxkXXsyfSkpfFs7PyY9XSkqKSg/OjooPzoo
PzooPzpbYS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9KSl8Wzs/Jj1dKSop
KT9AKT8oPzooPzooPzooPzooPzpbYS16QS1aXGRdKD86KD86W2EtekEtWlxkXXwtKSpbYS16QS1a
XGRdKT8pXC4pKig/OlthLXpBLVpdKD86KD86W2EtekEtWlxkXXwtKSpbYS16QS1aXGRdKT8pKXwo
PzooPzpcZCspKD86XC4oPzpcZCspKXszfSkpKD86Oig/OlxkKykpPykpKD86Lyg/Oig/Oig/Oig/
OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKXxbPzpAJj1dKSopKD86
Lyg/Oig/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKXxbPzpA
Jj1dKSopKSopKD86O3R5cGU9W0FJRGFpZF0pPyk/KXwoPzpuZXdzOig/Oig/Oig/Oig/OlthLXpB
LVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKXxbOy8/OiY9XSkrQCg/Oig/Oig/
Oig/OlthLXpBLVpcZF0oPzooPzpbYS16QS1aXGRdfC0pKlthLXpBLVpcZF0pPylcLikqKD86W2Et
ekEtWl0oPzooPzpbYS16QS1aXGRdfC0pKlthLXpBLVpcZF0pPykpfCg/Oig/OlxkKykoPzpcLig/
OlxkKykpezN9KSkpfCg/OlthLXpBLVpdKD86W2EtekEtWlxkXXxbXy4rLV0pKil8XCopKXwoPzpu
bnRwOi8vKD86KD86KD86KD86KD86W2EtekEtWlxkXSg/Oig/OlthLXpBLVpcZF18LSkqW2EtekEt
WlxkXSk/KVwuKSooPzpbYS16QS1aXSg/Oig/OlthLXpBLVpcZF18LSkqW2EtekEtWlxkXSk/KSl8
KD86KD86XGQrKSg/OlwuKD86XGQrKSl7M30pKSg/OjooPzpcZCspKT8pLyg/OlthLXpBLVpdKD86
W2EtekEtWlxkXXxbXy4rLV0pKikoPzovKD86XGQrKSk/KXwoPzp0ZWxuZXQ6Ly8oPzooPzooPzoo
PzooPzpbYS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9KSl8Wzs/Jj1dKSop
KD86Oig/Oig/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKXxb
Oz8mPV0pKikpP0ApPyg/Oig/Oig/Oig/Oig/OlthLXpBLVpcZF0oPzooPzpbYS16QS1aXGRdfC0p
KlthLXpBLVpcZF0pPylcLikqKD86W2EtekEtWl0oPzooPzpbYS16QS1aXGRdfC0pKlthLXpBLVpc
ZF0pPykpfCg/Oig/OlxkKykoPzpcLig/OlxkKykpezN9KSkoPzo6KD86XGQrKSk/KSkvPyl8KD86
Z29waGVyOi8vKD86KD86KD86KD86KD86W2EtekEtWlxkXSg/Oig/OlthLXpBLVpcZF18LSkqW2Et
ekEtWlxkXSk/KVwuKSooPzpbYS16QS1aXSg/Oig/OlthLXpBLVpcZF18LSkqW2EtekEtWlxkXSk/
KSl8KD86KD86XGQrKSg/OlwuKD86XGQrKSl7M30pKSg/OjooPzpcZCspKT8pKD86Lyg/OlthLXpB
LVpcZCRcLV8uKyEqJygpLDsvPzpAJj1dfCg/OiVbYS1mQS1GXGRdezJ9KSkoPzooPzooPzpbYS16
QS1aXGQkXC1fLishKicoKSw7Lz86QCY9XXwoPzolW2EtZkEtRlxkXXsyfSkpKikoPzolMDkoPzoo
PzooPzpbYS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9KSl8Wzs6QCY9XSkq
KSg/OiUwOSg/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLDsvPzpAJj1dfCg/OiVbYS1mQS1GXGRd
ezJ9KSkqKSk/KT8pPyk/KXwoPzp3YWlzOi8vKD86KD86KD86KD86KD86W2EtekEtWlxkXSg/Oig/
OlthLXpBLVpcZF18LSkqW2EtekEtWlxkXSk/KVwuKSooPzpbYS16QS1aXSg/Oig/OlthLXpBLVpc
ZF18LSkqW2EtekEtWlxkXSk/KSl8KD86KD86XGQrKSg/OlwuKD86XGQrKSl7M30pKSg/OjooPzpc
ZCspKT8pLyg/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKSop
KD86KD86Lyg/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKSop
Lyg/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKSopKXxcPyg/
Oig/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKXxbOzpAJj1d
KSopKT8pfCg/Om1haWx0bzooPzooPzpbYS16QS1aXGQkXC1fLishKicoKSw7Lz86QCY9XXwoPzol
W2EtZkEtRlxkXXsyfSkpKykpfCg/OmZpbGU6Ly8oPzooPzooPzooPzooPzpbYS16QS1aXGRdKD86
KD86W2EtekEtWlxkXXwtKSpbYS16QS1aXGRdKT8pXC4pKig/OlthLXpBLVpdKD86KD86W2EtekEt
WlxkXXwtKSpbYS16QS1aXGRdKT8pKXwoPzooPzpcZCspKD86XC4oPzpcZCspKXszfSkpfGxvY2Fs
aG9zdCk/Lyg/Oig/Oig/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17
Mn0pKXxbPzpAJj1dKSopKD86Lyg/Oig/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVth
LWZBLUZcZF17Mn0pKXxbPzpAJj1dKSopKSopKXwoPzpwcm9zcGVybzovLyg/Oig/Oig/Oig/Oig/
OlthLXpBLVpcZF0oPzooPzpbYS16QS1aXGRdfC0pKlthLXpBLVpcZF0pPylcLikqKD86W2EtekEt
Wl0oPzooPzpbYS16QS1aXGRdfC0pKlthLXpBLVpcZF0pPykpfCg/Oig/OlxkKykoPzpcLig/Olxk
KykpezN9KSkoPzo6KD86XGQrKSk/KS8oPzooPzooPzooPzpbYS16QS1aXGQkXC1fLishKicoKSxd
fCg/OiVbYS1mQS1GXGRdezJ9KSl8Wz86QCY9XSkqKSg/Oi8oPzooPzooPzpbYS16QS1aXGQkXC1f
LishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9KSl8Wz86QCY9XSkqKSkqKSg/Oig/OjsoPzooPzoo
PzpbYS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9KSl8Wz86QCZdKSopPSg/
Oig/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKXxbPzpAJl0p
KikpKSopfCg/OmxkYXA6Ly8oPzooPzooPzooPzooPzooPzpbYS16QS1aXGRdKD86KD86W2EtekEt
WlxkXXwtKSpbYS16QS1aXGRdKT8pXC4pKig/OlthLXpBLVpdKD86KD86W2EtekEtWlxkXXwtKSpb
YS16QS1aXGRdKT8pKXwoPzooPzpcZCspKD86XC4oPzpcZCspKXszfSkpKD86Oig/OlxkKykpPykp
Py8oPzooPzooPzooPzooPzooPzooPzpbYS16QS1aXGRdfCUoPzozXGR8WzQ2XVthLWZBLUZcZF18
WzU3XVtBYVxkXSkpfCg/OiUyMCkpK3woPzpPSUR8b2lkKVwuKD86KD86XGQrKSg/OlwuKD86XGQr
KSkqKSkoPzooPzolMFtBYV0pPyg/OiUyMCkqKT0oPzooPzolMFtBYV0pPyg/OiUyMCkqKSk/KD86
KD86W2EtekEtWlxkJFwtXy4rISonKCksXXwoPzolW2EtZkEtRlxkXXsyfSkpKikpKD86KD86KD86
JTBbQWFdKT8oPzolMjApKilcKyg/Oig/OiUwW0FhXSk/KD86JTIwKSopKD86KD86KD86KD86KD86
W2EtekEtWlxkXXwlKD86M1xkfFs0Nl1bYS1mQS1GXGRdfFs1N11bQWFcZF0pKXwoPzolMjApKSt8
KD86T0lEfG9pZClcLig/Oig/OlxkKykoPzpcLig/OlxkKykpKikpKD86KD86JTBbQWFdKT8oPzol
MjApKik9KD86KD86JTBbQWFdKT8oPzolMjApKikpPyg/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygp
LF18KD86JVthLWZBLUZcZF17Mn0pKSopKSkqKSg/Oig/Oig/Oig/OiUwW0FhXSk/KD86JTIwKSop
KD86WzssXSkoPzooPzolMFtBYV0pPyg/OiUyMCkqKSkoPzooPzooPzooPzooPzooPzpbYS16QS1a
XGRdfCUoPzozXGR8WzQ2XVthLWZBLUZcZF18WzU3XVtBYVxkXSkpfCg/OiUyMCkpK3woPzpPSUR8
b2lkKVwuKD86KD86XGQrKSg/OlwuKD86XGQrKSkqKSkoPzooPzolMFtBYV0pPyg/OiUyMCkqKT0o
PzooPzolMFtBYV0pPyg/OiUyMCkqKSk/KD86KD86W2EtekEtWlxkJFwtXy4rISonKCksXXwoPzol
W2EtZkEtRlxkXXsyfSkpKikpKD86KD86KD86JTBbQWFdKT8oPzolMjApKilcKyg/Oig/OiUwW0Fh
XSk/KD86JTIwKSopKD86KD86KD86KD86KD86W2EtekEtWlxkXXwlKD86M1xkfFs0Nl1bYS1mQS1G
XGRdfFs1N11bQWFcZF0pKXwoPzolMjApKSt8KD86T0lEfG9pZClcLig/Oig/OlxkKykoPzpcLig/
OlxkKykpKikpKD86KD86JTBbQWFdKT8oPzolMjApKik9KD86KD86JTBbQWFdKT8oPzolMjApKikp
Pyg/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKSopKSkqKSkq
KD86KD86KD86JTBbQWFdKT8oPzolMjApKikoPzpbOyxdKSg/Oig/OiUwW0FhXSk/KD86JTIwKSop
KT8pKD86XD8oPzooPzooPzooPzpbYS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRd
ezJ9KSkrKSg/OiwoPzooPzpbYS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9
KSkrKSkqKT8pKD86XD8oPzpiYXNlfG9uZXxzdWIpKD86XD8oPzooKD86W2EtekEtWlxkJFwtXy4r
ISonKCksOy8/OkAmPV18KD86JVthLWZBLUZcZF17Mn0pKSspKSk/KT8pPyl8KD86KD86ejM5XC41
MFtyc10pOi8vKD86KD86KD86KD86KD86W2EtekEtWlxkXSg/Oig/OlthLXpBLVpcZF18LSkqW2Et
ekEtWlxkXSk/KVwuKSooPzpbYS16QS1aXSg/Oig/OlthLXpBLVpcZF18LSkqW2EtekEtWlxkXSk/
KSl8KD86KD86XGQrKSg/OlwuKD86XGQrKSl7M30pKSg/OjooPzpcZCspKT8pKD86Lyg/Oig/Oig/
OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKSspKD86XCsoPzooPzpb
YS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9KSkrKSkqKD86XD8oPzooPzpb
YS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9KSkrKSk/KT8oPzo7ZXNuPSg/
Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKSspKT8oPzo7cnM9
KD86KD86W2EtekEtWlxkJFwtXy4rISonKCksXXwoPzolW2EtZkEtRlxkXXsyfSkpKykoPzpcKyg/
Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKSspKSopPykpfCg/
OmNpZDooPzooPzooPzpbYS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9KSl8
Wzs/OkAmPV0pKikpfCg/Om1pZDooPzooPzooPzpbYS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVb
YS1mQS1GXGRdezJ9KSl8Wzs/OkAmPV0pKikoPzovKD86KD86KD86W2EtekEtWlxkJFwtXy4rISon
KCksXXwoPzolW2EtZkEtRlxkXXsyfSkpfFs7PzpAJj1dKSopKT8pfCg/OnZlbW1pOi8vKD86KD86
KD86KD86KD86W2EtekEtWlxkXSg/Oig/OlthLXpBLVpcZF18LSkqW2EtekEtWlxkXSk/KVwuKSoo
PzpbYS16QS1aXSg/Oig/OlthLXpBLVpcZF18LSkqW2EtekEtWlxkXSk/KSl8KD86KD86XGQrKSg/
Oi4oPzpcZCspKXszfSkpKD86Oig/OlxkKykpPykoPzovKD86KD86KD86W2EtekEtWlxkJFwtXy4r
ISonKCksXXwoPzolW2EtZkEtRlxkXXsyfSkpfFsvPzpAJj1dKSopKD86KD86Oyg/Oig/Oig/Olth
LXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKXxbLz86QCZdKSopPSg/Oig/
Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKXxbLz86QCZdKSop
KSopKT8pfCg/OmltYXA6Ly8oPzooPzooPzooPzooPzooPzooPzpbYS16QS1aXGQkXC1fLishKico
KSxdfCg/OiVbYS1mQS1GXGRdezJ9KSl8WyY9fl0pKykoPzooPzo7W0FhXVtVdV1bVHRdW0hoXT0o
PzpcKnwoPzooPzooPzpbYS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9KSl8
WyY9fl0pKykpKSk/KXwoPzooPzo7W0FhXVtVdV1bVHRdW0hoXT0oPzpcKnwoPzooPzooPzpbYS16
QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9KSl8WyY9fl0pKykpKSg/Oig/Oig/
Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKXxbJj1+XSkrKSk/
KSlAKT8oPzooPzooPzooPzooPzpbYS16QS1aXGRdKD86KD86W2EtekEtWlxkXXwtKSpbYS16QS1a
XGRdKT8pXC4pKig/OlthLXpBLVpdKD86KD86W2EtekEtWlxkXXwtKSpbYS16QS1aXGRdKT8pKXwo
PzooPzpcZCspKD86XC4oPzpcZCspKXszfSkpKD86Oig/OlxkKykpPykpLyg/Oig/Oig/Oig/Oig/
Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZBLUZcZF17Mn0pKXxbJj1+OkAvXSkr
KT87W1R0XVtZeV1bUHBdW0VlXT0oPzpbTGxdKD86W0lpXVtTc11bVHRdfFtTc11bVXVdW0JiXSkp
KXwoPzooPzooPzooPzpbYS16QS1aXGQkXC1fLishKicoKSxdfCg/OiVbYS1mQS1GXGRdezJ9KSl8
WyY9fjpAL10pKykoPzpcPyg/Oig/Oig/OlthLXpBLVpcZCRcLV8uKyEqJygpLF18KD86JVthLWZB
LUZcZF17Mn0pKXxbJj1+OkAvXSkrKSk/KD86KD86O1tVdV1bSWldW0RkXVtWdl1bQWFdW0xsXVtJ
aV1bRGRdW0lpXVtUdF1bWXldPSg/OlsxLTldXGQqKSkpPyl8KD86KD86KD86KD86W2EtekEtWlxk
JFwtXy4rISonKCksXXwoPzolW2EtZkEtRlxkXXsyfSkpfFsmPX46QC9dKSspKD86KD86O1tVdV1b
SWldW0RkXVtWdl1bQWFdW0xsXVtJaV1bRGRdW0lpXVtUdF1bWXldPSg/OlsxLTldXGQqKSkpPyg/
Oi87W1V1XVtJaV1bRGRdPSg/OlsxLTldXGQqKSkoPzooPzovO1tTc11bRWVdW0NjXVtUdF1bSWld
W09vXVtObl09KD86KD86KD86W2EtekEtWlxkJFwtXy4rISonKCksXXwoPzolW2EtZkEtRlxkXXsy
fSkpfFsmPX46QC9dKSspKSk/KSkpPyl8KD86bmZzOig/Oig/Oi8vKD86KD86KD86KD86KD86W2Et
ekEtWlxkXSg/Oig/OlthLXpBLVpcZF18LSkqW2EtekEtWlxkXSk/KVwuKSooPzpbYS16QS1aXSg/
Oig/OlthLXpBLVpcZF18LSkqW2EtekEtWlxkXSk/KSl8KD86KD86XGQrKSg/OlwuKD86XGQrKSl7
M30pKSg/OjooPzpcZCspKT8pKD86KD86Lyg/Oig/Oig/Oig/Oig/OlthLXpBLVpcZFwkXC1fLiF+
KicoKSxdKXwoPzolW2EtZkEtRlxkXXsyfSl8WzpAJj0rXSkqKSg/Oi8oPzooPzooPzpbYS16QS1a
XGRcJFwtXy4hfionKCksXSl8KD86JVthLWZBLUZcZF17Mn0pfFs6QCY9K10pKikpKik/KSkpPyl8
KD86Lyg/Oig/Oig/Oig/Oig/OlthLXpBLVpcZFwkXC1fLiF+KicoKSxdKXwoPzolW2EtZkEtRlxk
XXsyfSl8WzpAJj0rXSkqKSg/Oi8oPzooPzooPzpbYS16QS1aXGRcJC1fLiF+KicoKSxdKXwoPzol
W2EtZkEtRlxkXXsyfSl8WzpAJj0rXSkqKSkqKT8pKXwoPzooPzooPzooPzooPzpbYS16QS1aXGRc
JFwtXy4hfionKCksXSl8KD86JVthLWZBLUZcZF17Mn0pfFs6QCY9K10pKikoPzovKD86KD86KD86
W2EtekEtWlxkXCRcLV8uIX4qJygpLF0pfCg/OiVbYS1mQS1GXGRdezJ9KXxbOkAmPStdKSopKSop
PykpKSk="""

# Decode the stored pattern.  b64decode() is used instead of the
# decodestring() alias, which newer Python releases no longer provide; it
# skips the line breaks embedded in the literal.  The pattern is pure ASCII,
# so it is decoded to text for use with string input.
pattern = base64.b64decode(pattern_in_base64).decode('ascii')
print(pattern)
matcher = re.compile(pattern)
the_string = 'http://www.example.com/ mailto:dude@example.com news:alt.example'
# The pattern contains two capturing groups (the added outer one and one in
# the ldap branch), so findall() would return 2-tuples whose second item is
# usually an empty string.  Collect the full text of each match instead.
matches = [found.group(0) for found in matcher.finditer(the_string)]

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Now printing matches")
for match in matches:
    print("Found url %s" % match)


This seems to do the trick for the most part, however it seems to produce two results with findall, one with the url, one an empty string. I'm way too scared to play with it further to figure out what to do, suffice to say, match[0] will get the job done.
Add comment

super!, nicolas pioli, 2005/07/12
j'ai utilisé avec succès vos travaux dans un script python: cf. recipe 252508 de doug tolton!
Add comment



Highest rated recipes:

1. Breaking down a URI into ...

2. Extracting HTML URL Links

3. Removing dangerous ...

4. Matching Royal Mail ...

5. Finding Palindromes

6. Finding URLs in text -- ...

7. Extract the Korean ...

8. Validate Domain Names

9. Validating email ...

10. Remove any HTML




Privacy Policy | Email Opt-out | Feedback | Syndication
© 2006 ActiveState Software Inc. All rights reserved.