Merge branch 'master' of ssh://git.ikiwiki.info/srv/git/ikiwiki.info

[ikiwiki.git] / doc / tips / importing_posts_from_wordpress / ikiwiki-wordpress-import.mdwn
diff --git a/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn b/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn

index 5d7a266ecac3419a5e98022afe5354ce7c761728..0c0527f2ce8e84a7d4aff0e1f9a85fb875ad64f5 100644 (file)
--- a/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn
+++ b/doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn
@@ -2,7 +2,7 @@
  
  I modified the script a bit so categories and tags would actually show up in the output file.
  
-
+-----
  <pre>
  #!/usr/bin/env python
  
@@ -29,7 +29,7 @@ I modified the script a bit so categories and tags would actually show up in the
      Usage: run --help as an argument with this script.
  
      Notes:
-    I added some extra bits to include the [[!tag foo]] stuff in the post,
+    I added some extra bits to include the \[[!tag foo]] stuff in the post,
      as it wasn't before, at all. I'll diff the versions out so you can see
      the mess I made :).
  
@@ -66,7 +66,7 @@ def main(name, email, subdir, branch='master'):
          commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
          timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
  
-        content = '[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"'))
+        content = '\[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"'))
          content += x.find('content:encoded').string.replace('\r\n', '\n')
  
          # categories = x.findAll('category')
@@ -76,8 +76,120 @@ def main(name, email, subdir, branch='master'):
          """
          We do it differently here because we have duplicates otherwise.
          Take a look:
-        <category><![CDATA[Health]]></category>
-       <category domain="category" nicename="health"><![CDATA[Health]]></category>
+        &lt;category&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
+        &lt;category domain="category" nicename="health"&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
+
+        If we do the what original did, we end up with all tags and cats doubled.
+        Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
+        I'd much rather have the value of 'nicename', and tried, but my
+        python skillz are extremely limited....
+        """
+        categories = x.findAll('category', nicename=True)
+        if categories:
+            content += "\n"
+            for cat in categories:
+                # remove 'tags/' because we have a 'tagbase' set.
+                # your choice: 'tag', or 'taglink'
+                # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-'))
+                content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-'))
+                # print >>sys.stderr, cat.string.replace(' ', '-')
+
+        # moved this thing down
+        data = content.encode('ascii', 'html_replace')
+        print "commit refs/heads/%s" % branch
+        print "committer %s &lt;%s&gt; %d +0000" % (name, email, timestamp)
+        print "data %d" % len(commit_msg)
+        print commit_msg
+        print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
+        print "data %d" % len(data)
+        print data
+
+if __name__ == "__main__":
+    if len(sys.argv) not in (4, 5):
+        print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])
+    else:
+        main(*sys.argv[1:])
+
+</pre>
+-----
+
+I have another version of the script, which uses the `timestamp` from the script, and inserts that as a \[[!meta date="foodate"]]. I'm posting it here just in case I happen to be doing something to the httpd.
+
+(Hopefully I've escaped everything properly; if I missed something, check the source.)
+
+-----
+<pre>
+#!/usr/bin/env python
+
+"""
+    Purpose:
+    Wordpress-to-Ikiwiki import tool
+
+    Copyright:
+    Copyright (C) 2007  Chris Lamb <chris@chris-lamb.co.uk>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+    Usage: run --help as an argument with this script.
+
+    Notes:
+    I added some extra bits to include the \[[!tag foo]] stuff in the post,
+    as it wasn't before, at all. I'll diff the versions out so you can see
+    the mess I made :).
+
+"""
+
+import os, sys
+import time
+import re
+
+from datetime import datetime
+from BeautifulSoup import BeautifulSoup
+
+import codecs, htmlentitydefs
+
+codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
+    % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
+
+def main(name, email, subdir, branch='master'):
+    soup = BeautifulSoup(sys.stdin.read())
+
+    # Regular expression to match stub in URL.
+    stub_pattern = re.compile(r'.*\/(.+)\/$')
+
+    for x in soup.findAll('item'):
+        # Ignore draft posts
+        if x.find('wp:status').string != 'publish': continue
+
+        match = stub_pattern.match(x.guid.string)
+        if match:
+            stub = match.groups()[0]
+        else:
+            # Fall back to our own stubs
+            stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()
+
+        commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
+        timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
+        content = '\[[!meta title="%s"]]\n' % (x.title.string.replace('"', r'\"'))
+        content += "\[[!meta date=\"%s\"]]\n" % datetime.fromtimestamp(timestamp)
+        content += x.find('content:encoded').string.replace('\r\n', '\n')
+
+        """
+        We do it differently here because we have duplicates otherwise.
+        Take a look:
+        &lt;category&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
+        &lt;category domain="category" nicename="health"&gt;&lt;![CDATA[Health]]&gt;&lt;/category&gt;
  
          If we do the what original did, we end up with all tags and cats doubled.
          Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
@@ -90,14 +202,15 @@ def main(name, email, subdir, branch='master'):
              for cat in categories:
                  # remove 'tags/' because we have a 'tagbase' set.
                  # your choice: 'tag', or 'taglink'
-                # content += "\n[[!tag %s]]" % (cat.string.replace(' ', '-'))
-                content += "\n[[!taglink %s]]" % (cat.string.replace(' ', '-'))
+                # content += "\n\[[!tag %s]]" % (cat.string.replace(' ', '-'))
+                content += "\n\[[!taglink %s]]" % (cat.string.replace(' ', '-'))
+                # this is just debugging, and for fun
                  # print >>sys.stderr, cat.string.replace(' ', '-')
  
          # moved this thing down
          data = content.encode('ascii', 'html_replace')
          print "commit refs/heads/%s" % branch
-        print "committer %s <%s> %d +0000" % (name, email, timestamp)
+        print "committer %s &lt;%s&gt; %d +0000" % (name, email, timestamp)
          print "data %d" % len(commit_msg)
          print commit_msg
          print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
@@ -111,3 +224,10 @@ if __name__ == "__main__":
          main(*sys.argv[1:])
  
  </pre>
+-----
+
+
+[[!tag wordpress]]
+[[!tag python]]
+[[!tag conversion]]
+[[!tag ikiwiki]]