sipb.mit.edu Git - ikiwiki.git/blobdiff - IkiWiki/Plugin/htmlscrubber.pm
po: added postscan hook, to make pages depend on the pages linking to them
[ikiwiki.git] / IkiWiki / Plugin / htmlscrubber.pm
index 897a398bae7cfe1d87b1b1b618857a407afa10f7..823b3d806cf1c51ae2eec5b715ff2991f7c79b3d 100644 (file)
@@ -5,23 +5,18 @@ use warnings;
 use strict;
 use IkiWiki 2.00;
 
 use strict;
 use IkiWiki 2.00;
 
-sub import { #{{{
-       hook(type => "sanitize", id => "htmlscrubber", call => \&sanitize);
-} # }}}
+# This regexp matches urls that are in a known safe scheme.
+# Feel free to use it from other plugins.
+our $safe_url_regexp;
 
 
-sub sanitize (@) { #{{{
-       my %params=@_;
-       return scrubber()->scrub($params{content});
-} # }}}
+sub import {
+       hook(type => "getsetup", id => "htmlscrubber", call => \&getsetup);
+       hook(type => "sanitize", id => "htmlscrubber", call => \&sanitize);
 
 
-my $_scrubber;
-sub scrubber { #{{{
-       return $_scrubber if defined $_scrubber;
-       
        # Only known uri schemes are allowed to avoid all the ways of
        # embedding javascript.
        # List at http://en.wikipedia.org/wiki/URI_scheme
        # Only known uri schemes are allowed to avoid all the ways of
        # embedding javascript.
        # List at http://en.wikipedia.org/wiki/URI_scheme
-       my $uri_schemes=join("|",
+       my $uri_schemes=join("|", map quotemeta,
                # IANA registered schemes
                "http", "https", "ftp", "mailto", "file", "telnet", "gopher",
                "aaa", "aaas", "acap",  "cap", "cid", "crid", 
                # IANA registered schemes
                "http", "https", "ftp", "mailto", "file", "telnet", "gopher",
                "aaa", "aaas", "acap",  "cap", "cid", "crid", 
@@ -33,11 +28,45 @@ sub scrubber { #{{{
                "aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
                "irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
                "msnim", "notes", "rsync", "secondlife", "skype", "ssh",
                "aim", "callto", "cvs", "ed2k", "feed", "fish", "gg",
                "irc", "ircs", "lastfm", "ldaps", "magnet", "mms",
                "msnim", "notes", "rsync", "secondlife", "skype", "ssh",
-               "sftp", "sms", "steam", "webcal", "ymsgr",
+               "sftp", "smb", "sms", "snews", "webcal", "ymsgr",
        );
        # data is a special case. Allow data:image/*, but
        # disallow data:text/javascript and everything else.
        );
        # data is a special case. Allow data:image/*, but
        # disallow data:text/javascript and everything else.
-       my $link=qr/^(?:(?:$uri_schemes):|data:image\/|[^:]+$)/i;
+       $safe_url_regexp=qr/^(?:(?:$uri_schemes):|data:image\/|[^:]+(?:$|\/))/i;
+}
+
+sub getsetup () {
+       return
+               plugin => {
+                       safe => 1,
+                       rebuild => undef,
+               },
+               htmlscrubber_skip => {
+                       type => "pagespec",
+                       example => "!*/Discussion",
+                       description => "PageSpec specifying pages not to scrub",
+                       link => "ikiwiki/PageSpec",
+                       safe => 1,
+                       rebuild => undef,
+               },
+}
+
+sub sanitize (@) {
+       my %params=@_;
+
+       if (exists $config{htmlscrubber_skip} &&
+           length $config{htmlscrubber_skip} &&
+           exists $params{destpage} &&
+           pagespec_match($params{destpage}, $config{htmlscrubber_skip})) {
+               return $params{content};
+       }
+
+       return scrubber()->scrub($params{content});
+}
+
+my $_scrubber;
+sub scrubber {
+       return $_scrubber if defined $_scrubber;
 
        eval q{use HTML::Scrubber};
        error($@) if $@;
 
        eval q{use HTML::Scrubber};
        error($@) if $@;
@@ -58,27 +87,30 @@ sub scrubber { #{{{
                        map { $_ => 1 } qw{
                                abbr accept accept-charset accesskey
                                align alt axis border cellpadding cellspacing
                        map { $_ => 1 } qw{
                                abbr accept accept-charset accesskey
                                align alt axis border cellpadding cellspacing
-                               char charoff charset checked cite class
+                               char charoff charset checked class
                                clear cols colspan color compact coords
                                datetime dir disabled enctype for frame
                                headers height hreflang hspace id ismap
                                clear cols colspan color compact coords
                                datetime dir disabled enctype for frame
                                headers height hreflang hspace id ismap
-                               label lang longdesc maxlength media method
+                               label lang maxlength media method
                                multiple name nohref noshade nowrap prompt
                                readonly rel rev rows rowspan rules scope
                                selected shape size span start summary
                                multiple name nohref noshade nowrap prompt
                                readonly rel rev rows rowspan rules scope
                                selected shape size span start summary
-                               tabindex target title type usemap valign
+                               tabindex target title type valign
                                value vspace width
                                autoplay loopstart loopend end
                                playcount controls 
                        } ),
                        "/" => 1, # emit proper <hr /> XHTML
                                value vspace width
                                autoplay loopstart loopend end
                                playcount controls 
                        } ),
                        "/" => 1, # emit proper <hr /> XHTML
-                       href => $link,
-                       src => $link,
-                       action => $link,
-                       poster => $link,
+                       href => $safe_url_regexp,
+                       src => $safe_url_regexp,
+                       action => $safe_url_regexp,
+                       cite => $safe_url_regexp,
+                       longdesc => $safe_url_regexp,
+                       poster => $safe_url_regexp,
+                       usemap => $safe_url_regexp,
                }],
        );
        return $_scrubber;
                }],
        );
        return $_scrubber;
-} # }}}
+}
 
 1
 
 1