doc/tips/importing_posts_from_wordpress/ikiwiki-wordpress-import.mdwn

   1 I modified the script a bit so categories and tags would actually show up in the output file.
   2
   3
   4 <pre>
   5 #!/usr/bin/env python
   6
   7 """
   8     Purpose:
   9     Wordpress-to-Ikiwiki import tool
  10
  11     Copyright:
  12     Copyright (C) 2007  Chris Lamb <chris@chris-lamb.co.uk>
  13
  14     This program is free software: you can redistribute it and/or modify
  15     it under the terms of the GNU General Public License as published by
  16     the Free Software Foundation, either version 3 of the License, or
  17     (at your option) any later version.
  18
  19     This program is distributed in the hope that it will be useful,
  20     but WITHOUT ANY WARRANTY; without even the implied warranty of
  21     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22     GNU General Public License for more details.
  23
  24     You should have received a copy of the GNU General Public License
  25     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  26
  27     Usage: run --help as an argument with this script.
  28
  29     Notes:
  30     I added some extra bits so that each post's categories and tags are
  31     emitted as [[!tag foo]] directives; the original script left them out
  32     entirely. I'll diff the versions out so you can see the mess I made :).
  33
  34 """
  35
  36 import os, sys
  37 import time
  38 import re
  39
  40 from BeautifulSoup import BeautifulSoup
  41
  42 import codecs, htmlentitydefs
  43
  44 codecs.register_error('html_replace', lambda x: (''.join([u'&%s;' \
  45     % htmlentitydefs.codepoint2name[ord(c)] for c in x.object[x.start:x.end]]), x.end))
  46
  47 def main(name, email, subdir, branch='master'):
  48     soup = BeautifulSoup(sys.stdin.read())
  49
  50     # Regular expression to match stub in URL.
  51     stub_pattern = re.compile(r'.*\/(.+)\/$')
  52
  53     for x in soup.findAll('item'):
  54         # Ignore draft posts
  55         if x.find('wp:status').string != 'publish': continue
  56
  57         match = stub_pattern.match(x.guid.string)
  58         if match:
  59             stub = match.groups()[0]
  60         else:
  61             # Fall back to our own stubs
  62             stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()
  63
  64         commit_msg = """Importing WordPress post "%s" [%s]""" % (x.title.string, x.guid.string)
  65         timestamp = time.mktime(time.strptime(x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))
  66
  67         content = '[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"'))
  68         content += x.find('content:encoded').string.replace('\r\n', '\n')
  69
  70         # categories = x.findAll('category')
  71         # categories = x.findAll({'category':True}, attrs={'domain':re.compile(('category|tag'))})
  72         # categories = x.findAll({'category':True}, domain=["category", "tag"])
  73         # categories = x.findAll({'category':True}, nicename=True)
  74         """
  75         We do it differently here because we have duplicates otherwise.
  76         Take a look:
  77         <category><![CDATA[Health]]></category>
  78         <category domain="category" nicename="health"><![CDATA[Health]]></category>
  79
  80         If we do the what original did, we end up with all tags and cats doubled.
  81         Therefore we only pick out nicename="foo". Our 'True' below is our 'foo'.
  82         I'd much rather have the value of 'nicename', and tried, but my
  83         python skillz are extremely limited....
  84         """
  85         categories = x.findAll('category', nicename=True)
  86         if categories:
  87             content += "\n"
  88             for cat in categories:
  89                 # remove 'tags/' because we have a 'tagbase' set.
  90                 # your choice: 'tag', or 'taglink'
  91                 # content += "\n[[!tag %s]]" % (cat.string.replace(' ', '-'))
  92                 content += "\n[[!taglink %s]]" % (cat.string.replace(' ', '-'))
  93                 # print >>sys.stderr, cat.string.replace(' ', '-')
  94
  95         # moved this thing down
  96         data = content.encode('ascii', 'html_replace')
  97         print "commit refs/heads/%s" % branch
  98         print "committer %s <%s> %d +0000" % (name, email, timestamp)
  99         print "data %d" % len(commit_msg)
 100         print commit_msg
 101         print "M 644 inline %s" % os.path.join(subdir, "%s.mdwn" % stub)
 102         print "data %d" % len(data)
 103         print data
 104
 105 if __name__ == "__main__":
 106     if len(sys.argv) not in (4, 5):
 107         print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])
 108     else:
 109         main(*sys.argv[1:])
 110
 111 </pre>