/*
 * Drupal 6 CSV dump->Hugo
 * (C) 2016 Augustin Cavalier
 *
 * NOTE: This script requires about 3GB of RAM. nodejs limits you to 1.5GB by default.
 * Override it by running this script with "node --max_old_space_size=4096 drupal2hugo.js".
 *
 * NOTE: CSV dumps generated using:
 *   sudo mkdir /tmp/mysql_dump_dir/
 *   sudo chmod u=rwx,g=rwx,o=rwx /tmp/mysql_dump_dir/
 *   sudo mysqldump --fields-enclosed-by='"' --fields-terminated-by=',' --tab /tmp/mysql_dump_dir/ website_livesite
 *
 * NOTE: The only CSV files you need from the dump are:
 *   node.txt, node_revisions.txt, url_alias.txt, term_data.txt, term_node.txt
 */

// Set this to `true` if you want to skip export of files that already exist.
// If false, it will abort with an error if a file already exists.
// BE VERY CAREFUL WHEN SETTING THIS TO TRUE, STRANGE THINGS MAY HAPPEN.
// DO NOT SET TO "true" ON FIRST RUN.
const SKIP_EXISTING_FILES = true;

const fs = require('fs');

// Character following a backslash inside a quoted field -> character it stands for.
const CSV_ESCAPES = new Map([
	["r", "\r"],
	["n", "\n"],
	["t", "\t"],
	['"', '"'],
	["\r", "\r"],
	["\n", "\n"],
	["\\", "\\"]
]);

/**
 * Parses the text of one dump file into an array of rows, each row being an
 * array of field strings.
 *
 * Only double-quoted fields are collected. Any other character outside quotes
 * (the separating commas, but also the text of any unquoted field) is ignored,
 * so an unquoted field contributes no column to its row. A "\n" outside quotes
 * ends the current row.
 *
 * @param {string} csv - Entire contents of one dump file.
 * @returns {string[][]} The parsed rows.
 * @throws {Error} If the input ends in the middle of a quoted field.
 */
function ParseCSV(csv) {
	const rows = [];
	let pos = 0;
	while (pos < csv.length) {
		const row = [];
		for (; pos < csv.length && csv[pos] != "\n"; pos++) {
			if (csv[pos] != '"')
				continue;

			// Collect everything up to the closing (unescaped) quote.
			let item = '';
			pos++;
			while (csv[pos] != '"') {
				// Without this check a truncated file would make the loop
				// append `undefined` forever.
				if (pos >= csv.length)
					throw new Error("unterminated quoted field, row[0] = " + row[0]);

				let ch = csv[pos];
				if (ch == '\\') {
					pos++;
					if (pos >= csv.length)
						throw new Error("unterminated quoted field, row[0] = " + row[0]);
					ch = csv[pos];
					if (CSV_ESCAPES.has(ch)) {
						ch = CSV_ESCAPES.get(ch);
					} else {
						console.log("WARN: unknown escape sequence: \\" + ch + ", row[0] = ", row[0]);
					}
				}
				item += ch;
				pos++;
			}
			// `pos` is on the closing quote; the for-loop increment steps past it.
			row.push(item);
		}
		rows.push(row);
		pos++; // step past the "\n" that ended the row
	}
	return rows;
}

/**
 * Reads and parses export/<name>.txt.
 *
 * @param {string} name - Table name, i.e. the dump file name without ".txt".
 * @returns {string[][]} The parsed rows.
 */
function ReadTable(name) {
	return ParseCSV(fs.readFileSync('export/' + name + '.txt', {encoding: "UTF-8"}));
}

const nodes = ReadTable('node');
const node_revisions = ReadTable('node_revisions');
const url_alias = ReadTable('url_alias');
const term_node = ReadTable('term_node');
let term_data = ReadTable('term_data');

// Create TagMap (column 0 of term_data -> column 2 of term_data)
const TagMap = new Map();
for (const term of term_data)
	TagMap.set(term[0], term[2]);
term_data = []; // no longer needed, let it be collected

// The tables below are consulted once per node; index them up front instead of
// rescanning every row for every node.

// column 0 of node_revisions (nid) -> all rows with that nid, in file order
const revisionsByNid = new Map();
for (const revision of node_revisions) {
	const list = revisionsByNid.get(revision[0]);
	if (list)
		list.push(revision);
	else
		revisionsByNid.set(revision[0], [revision]);
}

// column 1 of url_alias -> column 2 of url_alias; the first row for a source wins
const aliasBySource = new Map();
for (const alias of url_alias) {
	if (!aliasBySource.has(alias[1]))
		aliasBySource.set(alias[1], alias[2]);
}

// column 0 of term_node (nid) -> column 1 values (term ids), in file order
const termsByNid = new Map();
for (const link of term_node) {
	const list = termsByNid.get(link[0]);
	if (list)
		list.push(link[1]);
	else
		termsByNid.set(link[0], [link[1]]);
}

const header_template = `+++
type = "TYPE"
title = "TITLE"
date = "DATE"
tags = [TAGS]
+++`;
// author = "AUTHOR"

const base = "newsite/content";

// Characters that must be escaped inside a TOML basic ("...") string.
const TOML_ESCAPES = {
	"\\": "\\\\",
	'"': '\\"',
	"\n": "\\n",
	"\r": "\\r",
	"\t": "\\t"
};

/**
 * Escapes a value so it can be placed between double quotes in the header.
 *
 * @param {string} value - Raw text.
 * @returns {string} The text with backslashes, quotes, newlines, carriage
 *   returns and tabs backslash-escaped.
 */
function EscapeTomlString(value) {
	return String(value).replace(/[\\"\n\r\t]/g, ch => TOML_ESCAPES[ch]);
}

/**
 * Fills in header_template for one page.
 *
 * All placeholders are substituted in a single pass with a function replacer,
 * so text inside one value (a title containing "DATE", "TAGS" or "$&", say)
 * can never be mistaken for a placeholder or a replacement pattern.
 *
 * @param {string} type - Value for the "type" key.
 * @param {string} title - Page title (unescaped).
 * @param {string} created - Creation time in seconds since the epoch, as text.
 * @param {string[]} tags - Tag names (unescaped).
 * @param {string} [author] - If given, an "author" line is inserted before
 *   the title line.
 * @returns {string} The filled-in header.
 */
function BuildHeader(type, title, created, tags, author) {
	const values = {
		TYPE: type,
		TITLE: EscapeTomlString(title),
		DATE: (new Date(parseInt(created, 10) * 1000)).toISOString(),
		TAGS: tags.map(tag => '"' + EscapeTomlString(tag) + '"').join(", ")
	};
	let header = header_template.replace(/TYPE|TITLE|DATE|TAGS/g, key => values[key]);
	if (author !== undefined) {
		header = header.replace("\ntitle",
			() => '\nauthor = "' + EscapeTomlString(author) + '"\ntitle');
	}
	return header;
}

/**
 * Returns whether `stem` + ".txt", ".md" or ".html" exists.
 *
 * @param {string} stem - Path without extension.
 * @returns {boolean}
 */
function HasExportedFile(stem) {
	return fs.existsSync(stem + ".txt") || fs.existsSync(stem + ".md") || fs.existsSync(stem + ".html");
}

/**
 * If a page file named like the directory `dir` exists next to it
 * (dir + ".html", ".md" or ".txt", checked in that order), moves it into the
 * directory as its index file.
 *
 * @param {string} dir - Path of a directory that was just created.
 */
function MoveIntoIndex(dir) {
	if (fs.existsSync(dir + ".html"))
		fs.renameSync(dir + ".html", dir + "/index.html");
	else if (fs.existsSync(dir + ".md"))
		fs.renameSync(dir + ".md", dir + "/index.md");
	else if (fs.existsSync(dir + ".txt"))
		fs.renameSync(dir + ".txt", dir + "/index.txt");
}

/**
 * Works out where under `base` the page for `path` should be written,
 * creating intermediate directories on the way.
 *
 * If the full path already exists, the page becomes that path's "index" file.
 * If a page file is already present at the target, the result depends on
 * SKIP_EXISTING_FILES: `false` is returned when it is set, otherwise the
 * process exits with an error. The process also exits if `ext` is falsy.
 *
 * @param {string} path - URL path of the page, "/"-separated.
 * @param {string} type - Drupal node type.
 * @param {Array} node - The node row; only used in error messages.
 * @param {string} ext - File extension including the dot.
 * @returns {string|false} File name to write to, or false to skip the page.
 */
function GetSavePath(path, type, node, ext) {
	if (!ext) {
		console.log("No extension for node ", node);
		process.exit(1);
	}

	const countSlash = (path.match(/\//g) || []).length;
	if ((type == "blog" && countSlash == 3) || (type == "content_news" && countSlash == 2)) {
		// Some old blog content had a "/" between the date and the post title
		// So we replace the last '/' with an '_'.
		const lastSlash = path.lastIndexOf("/");
		path = path.substr(0, lastSlash) + "_" + path.substr(lastSlash + 1);
	}

	const parts = path.split("/");
	let ret = base;
	for (let i = 0; i < parts.length; i++) {
		ret += "/" + parts[i];
		const exists = fs.existsSync(ret);

		if (i != parts.length - 1) {
			// Not at the end yet: make sure the directory is there and move on.
			if (!exists) {
				fs.mkdirSync(ret);
				// Is there a file with the name of the folder we just made?
				// If so, make it the index of the subdirectory.
				MoveIntoIndex(ret);
			}
			continue;
		}

		// Last component. If it already exists, the page goes into it as "index".
		if (exists)
			ret += "/index";
		if (HasExportedFile(ret)) {
			if (SKIP_EXISTING_FILES) {
				// Assume the file was created in a previous run, skip export.
				return false;
			}
			console.error(exists ? "FATAL: could not find unused path for" : "FATAL: file already exists for",
				parts.join('/'), node);
			process.exit(1);
		}
		ret += ext;
	}
	return ret;
}

/**
 * Turns every <code>...</code> element whose span contains a line break into
 * <pre>...</pre>; single-line elements are left alone.
 *
 * NOTE(review): the tag literals were missing from the copy of this file that
 * was reviewed (empty search strings, broken replacement strings). They were
 * reconstructed from the surviving offsets: a 6-character opening tag and a
 * 7-character closing tag replaced by tags one character shorter. Confirm
 * against the original script.
 *
 * @param {string} content - Page body.
 * @returns {string} The body with multi-line code elements converted.
 */
function ConvertMultilineCode(content) {
	const OPEN = "<code>", CLOSE = "</code>";
	for (let from = 0; from < content.length; from++) {
		from = content.indexOf(OPEN, from);
		if (from == -1)
			break;
		const close = content.indexOf(CLOSE, from);
		if (close == -1)
			break; // no closing tag from here on, so nothing left to convert
		const inner = content.substring(from + OPEN.length, close);
		if (inner.indexOf("\n") != -1) {
			content = content.substring(0, from) + "<pre>" + inner + "</pre>"
				+ content.substring(close + CLOSE.length);
		}
	}
	return content;
}

/* "format" types are apparently:
 *   2: PHP code
 *   3: Full HTML
 *   5: Text w/limited HTML
 *   8: Text w/very limited HTML
 *  10: Plain text
 */
const FormatMap = {
	"0": ".html",
	"2": ".html",
	"3": ".html",
	"5": ".md",
	"8": ".md",
	"10": ".txt"
};

// Files written from nodes whose format is "2" (PHP code).
const hasPhpCode = [];

/**
 * Exports a single row of the node table as a content file, unless it is a
 * forum node, is unpublished, or has no matching revision content.
 *
 * @param {string[]} node - One row of the node table.
 */
function ExportNode(node) {
	const nid = node[0], type = node[2], title = node[3],
		created = node[6], changed = node[7];

	if (type == "forum")
		return;
	if (node[5] != "1") {
		console.log("INFO: skipping nid" + nid + ", unpublished");
		return;
	}

	// Take the revision whose column 7 equals the node's column 7; if several
	// match, the last one in the file wins.
	let content, format;
	for (const revision of revisionsByNid.get(nid) || []) {
		if (revision[7] != changed)
			continue;
		content = revision[4];
		format = revision[8];
	}
	if (!content) {
		console.log("WARN: could not find content for nid" + nid);
		return;
	}

	const url_src = 'node/' + nid;
	let url_dst = aliasBySource.get(url_src);
	if (!url_dst) {
		// If it's a news article, try to invent a URL
		if (type == "news" || type == "content_news") {
			url_dst = "news/" + (new Date(parseInt(created, 10) * 1000)).toISOString().substr(0, 10)
				+ "_" + title.toLowerCase().replace(/\ /g, "_");
			console.log("WARN: No url_dst for nid" + nid + ", invented one");
		} else
			url_dst = url_src;
	}

	const tags = [];
	for (const tid of termsByNid.get(nid) || []) {
		const tag = TagMap.get(tid);
		if (tag === undefined) {
			// Would otherwise end up as an empty "" entry in the tag list.
			console.log("WARN: unknown term " + tid + " for nid" + nid + ", skipped");
			continue;
		}
		if (tags.indexOf(tag) == -1)
			tags.push(tag);
	}

	let realtype;
	if (type == "blog")
		realtype = "blog";
	else if (type == "content_news" || url_dst.indexOf("news/") != -1)
		realtype = "news";
	else
		realtype = "article";

	// For blog posts the author is the second component of the URL, if any.
	let blogAuthor;
	if (type == "blog") {
		blogAuthor = url_dst.split('/')[1];
		if (blogAuthor == "pulkomandy") blogAuthor = "PulkoMandy"; // HACK
		if (blogAuthor == "barrett") blogAuthor = "Barrett"; // HACK
	}

	content = content.replace(/\r\n/g, "\n").replace(/\r/g, "\n");

	// Try to clean up some of the CSS classes.
	content = content.replace(/\"box\-stop\"/g, '"alert alert-danger"');
	content = content.replace(/\"box\-info\"/g, '"alert alert-info"');
	content = content.replace(/\"box\-warning\"/g, '"alert alert-warning"');

	content = ConvertMultilineCode(content);

	const outfile = BuildHeader(realtype, title, created, tags, blogAuthor) + '\n\n' + content;

	const file = GetSavePath(url_dst, type, node, FormatMap[format]);
	if (file !== false) {
		console.log("INFO: writing (nid" + nid + ")", file);
		if (format == "2")
			hasPhpCode.push(file);
		fs.writeFileSync(file, outfile);
	}
}

for (const node of nodes)
	ExportNode(node);

console.log("");
console.log("The following pages have PHP code in them and should be audited:", hasPhpCode);