]> sipb.mit.edu Git - ikiwiki.git/blobdiff - IkiWiki/Plugin/htmlscrubber.pm
Allow colons in URLs after the first slash
[ikiwiki.git] / IkiWiki / Plugin / htmlscrubber.pm
index e02a8591ef2eac17d9d34eba4031bcdb7ce6b3ab..3bdaccea119ec0e1b289a0da2f6d90e2219b8d66 100644 (file)
@@ -5,23 +5,17 @@ use warnings;
 use strict;
 use IkiWiki 2.00;
 
 use strict;
 use IkiWiki 2.00;
 
+# This regexp matches URLs that use a known safe scheme.
+# Feel free to use it from other plugins.
+our $safe_url_regexp;
+
 sub import { #{{{
        hook(type => "sanitize", id => "htmlscrubber", call => \&sanitize);
 sub import { #{{{
        hook(type => "sanitize", id => "htmlscrubber", call => \&sanitize);
-} # }}}
-
-sub sanitize (@) { #{{{
-       my %params=@_;
-       return scrubber()->scrub($params{content});
-} # }}}
 
 
-my $_scrubber;
-sub scrubber { #{{{
-       return $_scrubber if defined $_scrubber;
-       
        # Only known uri schemes are allowed to avoid all the ways of
        # embedding javascript.
        # List at http://en.wikipedia.org/wiki/URI_scheme
        # Only known uri schemes are allowed to avoid all the ways of
        # embedding javascript.
        # List at http://en.wikipedia.org/wiki/URI_scheme
-       my $uri_schemes=join("|",
+       my $uri_schemes=join("|", map quotemeta,
                # IANA registered schemes
                "http", "https", "ftp", "mailto", "file", "telnet", "gopher",
                "aaa", "aaas", "acap",  "cap", "cid", "crid", 
                # IANA registered schemes
                "http", "https", "ftp", "mailto", "file", "telnet", "gopher",
                "aaa", "aaas", "acap",  "cap", "cid", "crid", 
@@ -33,11 +27,21 @@ sub scrubber { #{{{
                "aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
                "irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
                "msnim", "notes", "rsync", "secondlife", "skype", "ssh",
                "aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
                "irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
                "msnim", "notes", "rsync", "secondlife", "skype", "ssh",
-               "sftp", "sms", "steam", "webcal", "ymsgr",
+               "sftp", "smb", "sms", "snews", "webcal", "ymsgr",
        );
        # data is a special case. Allow data:image/*, but
        # disallow data:text/javascript and everything else.
        );
        # data is a special case. Allow data:image/*, but
        # disallow data:text/javascript and everything else.
-       my $link=qr/^(?:$uri_schemes:|data:image\/|[^:]+$)/i;
+       $safe_url_regexp=qr/^(?:(?:$uri_schemes):|data:image\/|[^:]+(?:$|\/))/i;
+} # }}}
+
+sub sanitize (@) { #{{{
+       my %params=@_;
+       return scrubber()->scrub($params{content});
+} # }}}
+
+my $_scrubber;
+sub scrubber { #{{{
+       return $_scrubber if defined $_scrubber;
 
        eval q{use HTML::Scrubber};
        error($@) if $@;
 
        eval q{use HTML::Scrubber};
        error($@) if $@;
@@ -58,24 +62,27 @@ sub scrubber { #{{{
                        map { $_ => 1 } qw{
                                abbr accept accept-charset accesskey
                                align alt axis border cellpadding cellspacing
                        map { $_ => 1 } qw{
                                abbr accept accept-charset accesskey
                                align alt axis border cellpadding cellspacing
-                               char charoff charset checked cite class
+                               char charoff charset checked class
                                clear cols colspan color compact coords
                                datetime dir disabled enctype for frame
                                headers height hreflang hspace id ismap
                                clear cols colspan color compact coords
                                datetime dir disabled enctype for frame
                                headers height hreflang hspace id ismap
-                               label lang longdesc maxlength media method
+                               label lang maxlength media method
                                multiple name nohref noshade nowrap prompt
                                readonly rel rev rows rowspan rules scope
                                selected shape size span start summary
                                multiple name nohref noshade nowrap prompt
                                readonly rel rev rows rowspan rules scope
                                selected shape size span start summary
-                               tabindex target title type usemap valign
+                               tabindex target title type valign
                                value vspace width
                                autoplay loopstart loopend end
                                playcount controls 
                        } ),
                        "/" => 1, # emit proper <hr /> XHTML
                                value vspace width
                                autoplay loopstart loopend end
                                playcount controls 
                        } ),
                        "/" => 1, # emit proper <hr /> XHTML
-                       href => $link,
-                       src => $link,
-                       action => $link,
-                       poster => $link,
+                       href => $safe_url_regexp,
+                       src => $safe_url_regexp,
+                       action => $safe_url_regexp,
+                       cite => $safe_url_regexp,
+                       longdesc => $safe_url_regexp,
+                       poster => $safe_url_regexp,
+                       usemap => $safe_url_regexp,
                }],
        );
        return $_scrubber;
                }],
        );
        return $_scrubber;