* htmlscrubber security fix: Block javascript in uris.
[ikiwiki.git] / IkiWiki / Plugin / htmlscrubber.pm
index 4245c879cdefa12f127306a0136fdfdc1d00164f..25caa8a506cdf5ab8ea0c9b0d3e9e935f62bd6f8 100644 (file)
@@ -18,40 +18,66 @@ my $_scrubber;
 sub scrubber { #{{{
        return $_scrubber if defined $_scrubber;
        
 sub scrubber { #{{{
        return $_scrubber if defined $_scrubber;
        
+       # Only known uri schemes are allowed to avoid all the ways of
+       # embedding javascript.
+       # List at http://en.wikipedia.org/wiki/URI_scheme
+       my $uri_schemes=join("|",
+               # IANA registered schemes
+               "http", "https", "ftp", "mailto", "file", "telnet", "gopher",
+               "aaa", "aaas", "acap",  "cap", "cid", "crid", 
+               "dav", "dict", "dns", "fax", "go", "h323", "im", "imap",
+               "ldap", "mid", "news", "nfs", "nntp", "pop", "pres",
+               "sip", "sips", "snmp", "tel", "urn", "wais", "xmpp",
+               "z39.50r", "z39.50s",
+               # data is a special case. Allow data:text/<image>, but
+               # disallow data:text/javascript and everything else.
+               qr/data:text\/(?:png|gif|jpeg)/,
+               # Selected unofficial schemes
+               "about", "aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
+               "irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
+               "msnim", "notes", "rsync", "secondlife", "skype", "ssh",
+               "sftp", "sms", "steam", "webcal", "ymsgr",
+       );
+       my $link=qr/^(?:$uri_schemes:|[^:]+$)/i;
+
        eval q{use HTML::Scrubber};
        error($@) if $@;
        # Lists based on http://feedparser.org/docs/html-sanitization.html
        # With html 5 video and audio tags added.
        $_scrubber = HTML::Scrubber->new(
                allow => [qw{
        eval q{use HTML::Scrubber};
        error($@) if $@;
        # Lists based on http://feedparser.org/docs/html-sanitization.html
        # With html 5 video and audio tags added.
        $_scrubber = HTML::Scrubber->new(
                allow => [qw{
-                       a abbr acronym address area b big blockquote br
+                       a abbr acronym address area b big blockquote br br/
                        button caption center cite code col colgroup dd del
                        dfn dir div dl dt em fieldset font form h1 h2 h3 h4
                        button caption center cite code col colgroup dd del
                        dfn dir div dl dt em fieldset font form h1 h2 h3 h4
-                       h5 h6 hr i img input ins kbd label legend li map
-                       menu ol optgroup option p pre q s samp select small
+                       h5 h6 hr hr/ i img input ins kbd label legend li map
+                       menu ol optgroup option p p/ pre q s samp select small
                        span strike strong sub sup table tbody td textarea
                        tfoot th thead tr tt u ul var
                        video audio
                }],
                default => [undef, { (
                        map { $_ => 1 } qw{
                        span strike strong sub sup table tbody td textarea
                        tfoot th thead tr tt u ul var
                        video audio
                }],
                default => [undef, { (
                        map { $_ => 1 } qw{
-                               abbr accept accept-charset accesskey action
+                               abbr accept accept-charset accesskey
                                align alt axis border cellpadding cellspacing
                                char charoff charset checked cite class
                                clear cols colspan color compact coords
                                datetime dir disabled enctype for frame
                                align alt axis border cellpadding cellspacing
                                char charoff charset checked cite class
                                clear cols colspan color compact coords
                                datetime dir disabled enctype for frame
-                               headers height href hreflang hspace id ismap
+                               headers height hreflang hspace id ismap
                                label lang longdesc maxlength media method
                                multiple name nohref noshade nowrap prompt
                                readonly rel rev rows rowspan rules scope
                                label lang longdesc maxlength media method
                                multiple name nohref noshade nowrap prompt
                                readonly rel rev rows rowspan rules scope
-                               selected shape size span src start summary
+                               selected shape size span start summary
                                tabindex target title type usemap valign
                                value vspace width
                                tabindex target title type usemap valign
                                value vspace width
-                               poster autoplay loopstart loopend end
+                               autoplay loopstart loopend end
                                playcount controls 
                        } ),
                        "/" => 1, # emit proper <hr /> XHTML
                                playcount controls 
                        } ),
                        "/" => 1, # emit proper <hr /> XHTML
-                       }],
+                       href => $link,
+                       src => $link,
+                       action => $link,
+                       poster => $link,
+               }],
        );
        return $_scrubber;
 } # }}}
        );
        return $_scrubber;
 } # }}}