1 I modified the script a bit so categories and tags would actually show up in the output file.
9 Wordpress-to-Ikiwiki import tool
12 Copyright (C) 2007 Chris Lamb <chris@chris-lamb.co.uk>
14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program. If not, see <http://www.gnu.org/licenses/>.
27 Usage: run --help as an argument with this script.
I added some extra bits to include the [[!tag foo]] directives in the post,
since the original script did not emit them at all. I'll diff the versions so you can see what changed.
40 from BeautifulSoup import BeautifulSoup
42 import codecs, htmlentitydefs
def _html_replace(error):
    """Codec error handler that replaces unencodable characters with HTML entities.

    ``error`` is the UnicodeEncodeError raised by ``encode``; the characters
    in ``error.object[error.start:error.end]`` are the ones that could not be
    encoded.  Returns the ``(replacement, resume_position)`` pair expected by
    the ``codecs`` error-handler protocol.
    """
    pieces = []
    for char in error.object[error.start:error.end]:
        codepoint = ord(char)
        entity = htmlentitydefs.codepoint2name.get(codepoint)
        if entity is not None:
            pieces.append(u'&%s;' % entity)
        else:
            # Not every character has a named entity; use a numeric
            # character reference instead of failing with KeyError.
            pieces.append(u'&#%d;' % codepoint)
    return u''.join(pieces), error.end


codecs.register_error('html_replace', _html_replace)
def main(name, email, subdir, branch='master'):
    """Turn a WordPress export read from stdin into a git fast-import stream.

    One commit is written to stdout for every ``<item>`` whose ``wp:status``
    is ``publish``.  Each commit adds ``<subdir>/<stub>.mdwn`` holding an
    ikiwiki ``meta title`` directive, the post body and one ``taglink``
    directive per category/tag.

    name   -- committer name written into every commit
    email  -- committer e-mail written into every commit
    subdir -- directory the generated ``.mdwn`` pages are placed in
    branch -- branch below ``refs/heads/`` that receives the commits
    """
    # Imported locally so the file-level import list stays untouched.
    import calendar

    soup = BeautifulSoup(sys.stdin.read())
    out = sys.stdout

    # Regular expression to match stub in URL.
    stub_pattern = re.compile(r'.*\/(.+)\/$')

    for x in soup.findAll('item'):
        # Only published posts are imported.
        if x.find('wp:status').string != 'publish':
            continue

        match = stub_pattern.match(x.guid.string)
        if match:
            stub = match.group(1)
        else:
            # Fall back to our own stubs
            stub = re.sub(r'[^a-zA-Z0-9_]', '-', x.title.string).lower()

        # Encoded up front: the "data <count>" header below must give the
        # payload size in bytes, not in characters.
        commit_msg = ("""Importing WordPress post "%s" [%s]"""
                      % (x.title.string, x.guid.string)).encode('utf-8')

        # post_date_gmt is a GMT time and the committer line declares +0000,
        # so convert with timegm (UTC) rather than mktime (local time).
        timestamp = calendar.timegm(time.strptime(
            x.find('wp:post_date_gmt').string, "%Y-%m-%d %H:%M:%S"))

        content = '[[!meta title="%s"]]\n\n' % (x.title.string.replace('"', r'\"'))
        content += x.find('content:encoded').string.replace('\r\n', '\n')

        # The export lists each category/tag twice, e.g.
        #   <category><![CDATA[Health]]></category>
        #   <category domain="category" nicename="health"><![CDATA[Health]]></category>
        # Taking every <category> would double all tags and categories, so
        # only the elements that carry a ``nicename`` attribute are selected.
        categories = x.findAll('category', nicename=True)

        for cat in categories:
            # No 'tags/' prefix because a 'tagbase' is set.
            # Your choice of directive: 'tag' or 'taglink'.
            content += "\n[[!taglink %s]]" % (cat.string.replace(' ', '-'))

        # Non-ASCII characters in the page become HTML entities.
        data = content.encode('ascii', 'html_replace')

        out.write("commit refs/heads/%s\n" % branch)
        out.write("committer %s <%s> %d +0000\n" % (name, email, timestamp))
        out.write("data %d\n" % len(commit_msg))
        out.write(commit_msg)
        out.write("\n")
        out.write("M 644 inline %s\n" % os.path.join(subdir, "%s.mdwn" % stub))
        out.write("data %d\n" % len(data))
        out.write(data)
        out.write("\n")
105 if __name__ == "__main__":
106 if len(sys.argv) not in (4, 5):
107 print >>sys.stderr, "%s: usage: %s name email subdir [branch] < wordpress-export.xml | git-fast-import " % (sys.argv[0], sys.argv[0])