haiku-website/scripts/drupal2hugo.js

304 lines
8.4 KiB
JavaScript

/*
* Drupal 6 CSV dump->Hugo
* (C) 2016 Augustin Cavalier <waddlesplash>
*
* NOTE: This script requires about 3GB of RAM. nodejs limits you to 1.5GB by default.
* Override it by running this script with "node --max_old_space_size=4096 drupal2hugo.js".
*
* NOTE: CSV dumps generated using:
* sudo mkdir /tmp/mysql_dump_dir/
* sudo chmod u=rwx,g=rwx,o=rwx /tmp/mysql_dump_dir/
* sudo mysqldump --fields-enclosed-by='"' --fields-terminated-by=',' --tab /tmp/mysql_dump_dir/ website_livesite
*
* NOTE: The only CSV files you need from the dump are:
* node.txt, node_revisions.txt, url_alias.txt, term_data.txt, term_node.txt
*/
// Controls what GetSavePath() does when an output file for a node already exists
// (a .txt, .md or .html file at the target path, or an index file in the target directory):
//   true  - GetSavePath() returns false and the export loop skips writing that node.
//   false - GetSavePath() prints a FATAL message and exits the process with status 1.
// Set this to `true` if you want to skip export of files that already exist.
// BE VERY CAREFUL WHEN SETTING THIS TO TRUE, STRANGE THINGS MAY HAPPEN.
// DO NOT SET TO "true" ON FIRST RUN.
const SKIP_EXISTING_FILES = true;
var fs = require('fs');
/*
 * Parses the text of one dump file into an array of rows, each row being an
 * array of field strings.
 *
 * Only double-quoted fields are collected. Everything between fields (the
 * separators and any unquoted text) is ignored, and an unquoted "\n" ends the
 * current row. Inside a quoted field a backslash escapes the next character:
 * \r, \n and \t become the corresponding control character; an escaped quote,
 * backslash, CR or LF is taken literally; any other escaped character is kept
 * as-is after printing a warning.
 *
 * NOTE(review): MySQL documents that NULL is exported as an unquoted \N. Such
 * a field would be skipped here rather than stored, shifting the following
 * columns of that row -- confirm the tables in use have no NULLs.
 *
 * csv: the complete file contents as a string.
 * Returns: array of rows (arrays of strings). Empty input gives [].
 *
 * A quoted field that is still open when the input ends (missing closing
 * quote, or a trailing backslash) is reported with a warning and kept with
 * whatever was read so far, instead of looping forever.
 */
function ParseCSV(csv) {
	var rows = [];
	var i = 0;
	while (i < csv.length) {
		var row = [];
		// Consume one line: everything up to the next unquoted newline.
		while (i < csv.length && csv[i] != "\n") {
			if (csv[i] != '"') {
				// Separator or other unquoted character: not part of any field.
				i++;
				continue;
			}
			i++; // Step over the opening quote.
			var itm = '', closed = false;
			while (i < csv.length) {
				var ch = csv[i++];
				if (ch == '"') {
					closed = true;
					break;
				}
				if (ch != '\\') {
					itm += ch;
					continue;
				}
				// Backslash: the next character decides what is appended.
				if (i >= csv.length)
					break; // Dangling backslash at the very end of the input.
				var esc = csv[i++];
				switch (esc) {
				case "r":
					itm += "\r";
					break;
				case "n":
					itm += "\n";
					break;
				case "t":
					itm += "\t";
					break;
				case '"':
				case "\\":
				case "\r":
				case "\n":
					itm += esc;
					break;
				default:
					console.log("WARN: unknown escape sequence: \\" + esc + ", row[0] = ", row[0]);
					itm += esc;
					break;
				}
			}
			if (!closed)
				console.log("WARN: unterminated quoted field at end of input, row[0] = ", row[0]);
			row.push(itm);
		}
		rows.push(row);
		i++; // Step over the newline that ended this row (or past the end).
	}
	return rows;
}
// Load every table the conversion needs, one parsed row array per dump file.
var nodes = ParseCSV(fs.readFileSync('export/node.txt', {encoding: "UTF-8"}));
var node_revisions = ParseCSV(fs.readFileSync('export/node_revisions.txt', {encoding: "UTF-8"}));
var url_alias = ParseCSV(fs.readFileSync('export/url_alias.txt', {encoding: "UTF-8"}));
var term_data = ParseCSV(fs.readFileSync('export/term_data.txt', {encoding: "UTF-8"}));
var term_node = ParseCSV(fs.readFileSync('export/term_node.txt', {encoding: "UTF-8"}));

// TagMap: column 0 of each term_data row -> column 2 of the same row.
var TagMap = {};
term_data.forEach(function (term) {
	TagMap[term[0]] = term[2];
});
// The parsed rows are not needed once the map exists; drop the reference.
term_data = [];

// Front matter written at the top of every exported file. The upper-case
// words are placeholders that the export loop fills in per node.
var header_template =
`+++
type = "TYPE"
title = "TITLE"
date = "DATE"
tags = [TAGS]
+++`;
// There is no author placeholder: the export loop inserts an
// author = "..." line in front of the title line for blog posts only.

// Directory under which all exported files are created.
var base = "newsite/content";
/*
 * Works out the file a node should be written to below `base`, creating any
 * missing intermediate directories on the way.
 *
 * path: the node's URL path, components separated by "/".
 * type: the node type; "blog" paths with three slashes and "content_news"
 *       paths with two slashes get their last "/" turned into "_".
 * node: the node row, only used in diagnostics.
 * ext:  the extension (with leading dot) for the new file; if it is missing
 *       the process exits with status 1.
 *
 * Returns the full path including extension, or false when a .txt/.md/.html
 * file already occupies the target and SKIP_EXISTING_FILES is set. When it
 * is not set, an occupied target is fatal (exit status 1).
 *
 * If the final component already exists on disk, the node becomes that
 * entry's "index" file. When a directory has to be created and a file
 * with the same name plus .html/.md/.txt sits next to it, that file is moved
 * into the new directory as its index.
 */
function GetSavePath(path, type, node, ext) {
	if (!ext) {
		console.log("No extension for node ", node);
		process.exit(1);
	}

	var slashes = (path.match(/\//g) || []).length;
	var relative = path;
	var dateSplit = (type == "blog") ? (slashes == 3)
		: ((type == "content_news") ? (slashes == 2) : false);
	if (dateSplit) {
		// Some old content had a "/" between the date and the post title;
		// glue the two halves back together with an "_".
		var cut = relative.lastIndexOf("/");
		relative = relative.slice(0, cut) + "_" + relative.slice(cut + 1);
	}

	// True if any file this script could have produced already uses `stem`.
	var occupied = function (stem) {
		return fs.existsSync(stem + ".txt") || fs.existsSync(stem + ".md")
			|| fs.existsSync(stem + ".html");
	};

	var parts = relative.split("/");
	var last = parts.length - 1;
	var ret = base;
	for (var idx = 0; idx <= last; idx++) {
		ret += "/" + parts[idx];
		var present = fs.existsSync(ret);

		if (idx != last) {
			// Intermediate component: make sure a directory is there.
			if (present)
				continue;
			fs.mkdirSync(ret);
			// A page that shares the new directory's name becomes its index.
			var candidates = [".html", ".md", ".txt"];
			for (var c = 0; c < candidates.length; c++) {
				if (fs.existsSync(ret + candidates[c])) {
					fs.renameSync(ret + candidates[c], ret + "/index" + candidates[c]);
					break;
				}
			}
			continue;
		}

		// Final component: an existing entry means we write its index file.
		if (present)
			ret += "/index";
		if (occupied(ret)) {
			if (SKIP_EXISTING_FILES) {
				// Assume a previous run produced the file; skip this node.
				return false;
			}
			if (present)
				console.error("FATAL: could not find unused path for", parts.join('/'), node);
			else
				console.error("FATAL: file already exists for", parts.join('/'), node);
			process.exit(1);
		}
		ret += ext;
	}
	return ret;
}
/* Output file extension for each input "format" id found in node_revisions.
 * The ids are apparently:
 *    2: PHP code
 *    3: Full HTML
 *    5: Text w/limited HTML
 *    8: Text w/very limited HTML
 *   10: Plain text
 * Id 0 is not in that list; it is exported as HTML as well (presumably it
 * means "no format set" -- unconfirmed). A format id with no entry here has
 * no extension, which GetSavePath() treats as fatal.
 */
var FormatMap = {
	0: ".html",
	2: ".html",
	3: ".html",
	5: ".md",
	8: ".md",
	10: ".txt"
};
/*
 * Escapes a value so it can be placed between the double quotes of a TOML
 * basic string: backslashes first, then double quotes.
 */
function EscapeTomlString(str) {
	return str.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
}

/*
 * Fills in header_template for one node and returns the finished front matter.
 *
 * All four placeholders are substituted in a single pass over the template,
 * so text that has just been inserted (e.g. a title containing "DATE") is
 * never rescanned for placeholders. Values are supplied through replacer
 * functions so that "$" sequences in them are not treated as replacement
 * patterns.
 *
 * realtype: value for the "type" key.
 * title:    raw node title (escaped here).
 * date:     ISO 8601 timestamp string.
 * tags:     array of raw tag names (escaped here).
 * author:   optional author; when given, an author line is inserted just
 *           before the title line.
 */
function BuildHeader(realtype, title, date, tags, author) {
	const values = {
		TYPE: realtype,
		TITLE: EscapeTomlString(title),
		DATE: date,
		TAGS: tags.map((tag) => '"' + EscapeTomlString(tag) + '"').join(", ")
	};
	let header = header_template.replace(/TYPE|TITLE|DATE|TAGS/g, (placeholder) => values[placeholder]);
	if (author) {
		const authorLine = '\nauthor = "' + EscapeTomlString(author) + '"\ntitle';
		header = header.replace("\ntitle", () => authorLine);
	}
	return header;
}

/*
 * Turns every <code> element whose text contains a line break into a <pre>
 * element. Single-line <code> elements and unclosed <code> tags are left
 * untouched.
 */
function ConvertMultilineCode(content) {
	let pos = 0;
	while ((pos = content.indexOf("<code>", pos)) != -1) {
		const end = content.indexOf("</code>", pos);
		if (end != -1 && content.substring(pos, end).indexOf("\n") != -1) {
			content = content.substring(0, pos) + "<pre>"
				+ content.substring(pos + 6, end) + "</pre>"
				+ content.substring(end + 7);
		}
		pos++;
	}
	return content;
}

// Files written from nodes with format "2" (PHP code), reported at the end.
const hasPhpCode = [];

// Build lookup tables once instead of rescanning each table for every node.
// Revisions are keyed by (column 0, column 7); a later row replaces an
// earlier one with the same key.
const revisionByNodeAndTime = new Map();
for (const revision of node_revisions)
	revisionByNodeAndTime.set(revision[0] + "@" + revision[7], revision);
// Aliases are keyed by column 1; the first row for a source wins.
const aliasBySource = new Map();
for (const alias of url_alias) {
	if (!aliasBySource.has(alias[1]))
		aliasBySource.set(alias[1], alias[2]);
}
// Term ids (column 1) grouped by node id (column 0), in file order.
const termIdsByNode = new Map();
for (const link of term_node) {
	let ids = termIdsByNode.get(link[0]);
	if (!ids) {
		ids = [];
		termIdsByNode.set(link[0], ids);
	}
	ids.push(link[1]);
}

// Export every published, non-forum node that has content.
for (const node of nodes) {
	const type = node[2];
	if (type == "forum")
		continue;
	const nid = node[0],
		title = node[3],
		isPublished = (node[5] == "1"),
		created = node[6],
		changed = node[7];
	if (!isPublished) {
		console.log("INFO: skipping nid" + nid + ", unpublished");
		continue;
	}

	// The body comes from the revision whose timestamp equals the node's
	// "changed" value.
	const revision = revisionByNodeAndTime.get(nid + "@" + changed);
	let content = revision ? revision[4] : undefined;
	if (!content) {
		console.log("WARN: could not find content for nid" + nid);
		continue;
	}
	const format = revision[8];

	const createdIso = (new Date(parseInt(created, 10) * 1000)).toISOString();

	const url_src = 'node/' + nid;
	let url_dst = aliasBySource.get(url_src);
	if (!url_dst) {
		// If it's a news article, try to invent a URL
		if (type == "news" || type == "content_news") {
			url_dst = "news/" + createdIso.substr(0, 10) + "_" +
				title.toLowerCase().replace(/ /g, "_");
			console.log("WARN: No url_dst for nid" + nid + ", invented one");
		} else
			url_dst = url_src;
	}

	// Collect the node's tag names without duplicates.
	const tags = [];
	for (const tid of termIdsByNode.get(nid) || []) {
		const tag = TagMap[tid];
		if (tag === undefined) {
			// Would otherwise end up as an empty "" entry in the tag list.
			console.log("WARN: unknown term " + tid + " on nid" + nid + ", ignored");
			continue;
		}
		if (tags.indexOf(tag) == -1)
			tags.push(tag);
	}

	let realtype;
	if (type == "blog")
		realtype = "blog";
	else if (type == "content_news" || url_dst.indexOf("news/") != -1)
		realtype = "news";
	else
		realtype = "article";

	let blogAuthor;
	if (type == "blog") {
		// The second URL component of a blog post is taken as its author.
		blogAuthor = url_dst.split('/')[1];
		if (blogAuthor == "pulkomandy") blogAuthor = "PulkoMandy"; // HACK
		if (blogAuthor == "barrett") blogAuthor = "Barrett"; // HACK
		if (!blogAuthor)
			console.log("WARN: no author in URL for nid" + nid + ", author line omitted");
	}

	content = content.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
	// Try to clean up some of the CSS classes.
	content = content.replace(/"box-stop"/g, '"alert alert-danger"');
	content = content.replace(/"box-info"/g, '"alert alert-info"');
	content = content.replace(/"box-warning"/g, '"alert alert-warning"');
	// Clean up <code> tags.
	content = ConvertMultilineCode(content);

	const outfile = BuildHeader(realtype, title, createdIso, tags, blogAuthor) + '\n\n' + content;

	const file = GetSavePath(url_dst, type, node, FormatMap[format]);
	if (file === false)
		continue; // Target already exists and SKIP_EXISTING_FILES is set.
	console.log("INFO: writing (nid" + nid + ")", file);
	if (format == "2")
		hasPhpCode.push(file);
	fs.writeFileSync(file, outfile);
}
console.log("");
console.log("The following pages have PHP code in them and should be audited:", hasPhpCode);