Add pandoc conversion

This commit is contained in:
Alexander Nozik 2023-06-16 20:16:22 +03:00
parent 62f6436cf6
commit 3c1a1bd99d
4 changed files with 86 additions and 9 deletions

53
src/main/kotlin/html.kt Normal file
View File

@ -0,0 +1,53 @@
package ru.mipt.npm.space.documentextractor
import java.nio.file.Path
import java.nio.file.StandardOpenOption
import java.nio.file.attribute.FileAttribute
import kotlin.io.path.*
/**
 * Mirror the directory tree at [inputPath] into [outputPath], converting every Markdown (`*.md`)
 * file to a standalone HTML page with `pandoc` and copying all other entries unchanged.
 *
 * The bundled `links-to-html.lua` pandoc filter is unpacked into a `scripts` directory that is a
 * sibling of [inputPath] (only when it is not already there) and passed to every pandoc invocation.
 *
 * Each generated page is placed next to where the Markdown file would have been copied, so the
 * directory layout of the input is preserved in the output.
 *
 * Requires the `pandoc` executable to be available on the `PATH`. A non-zero pandoc exit code is
 * logged as a warning and the remaining files are still processed.
 *
 * @param inputPath root directory with the exported documents.
 * @param outputPath root directory that receives the HTML pages and copied files.
 * @throws IllegalStateException if the bundled Lua filter resource cannot be found.
 */
@OptIn(ExperimentalPathApi::class)
fun generateHtml(inputPath: Path, outputPath: Path) {
    val scriptPath = inputPath.resolveSibling("scripts").resolve("links-to-html.lua")
    if (!scriptPath.exists()) {
        scriptPath.parent.createDirectories()
        // `{}.javaClass` is just a handle to a class from this module, used to reach its resources.
        val filterResource = checkNotNull({}.javaClass.getResource("/links-to-html.lua")) {
            "Bundled resource '/links-to-html.lua' is missing from the classpath"
        }
        scriptPath.writeText(
            filterResource.readText(),
            Charsets.UTF_8,
            StandardOpenOption.CREATE,
        )
    }

    inputPath.copyToRecursively(outputPath, followLinks = false) { source: Path, target: Path ->
        if (source.isRegularFile() && source.extension == "md") {
            // Resolve against `target` (not `outputPath`) so documents from nested folders keep
            // their relative location instead of being flattened into the output root.
            val targetPath = target.resolveSibling(source.nameWithoutExtension + ".html")
            val exitCode = ProcessBuilder(
                "pandoc",
                "--standalone",
                "--mathjax",
                "--metadata=title: ${source.nameWithoutExtension}",
                "--from=markdown",
                "--to=html5",
                "--lua-filter=${scriptPath.absolute()}",
                "--output=${targetPath.absolute()}",
                "${source.absolute()}",
            ).also {
                logger.info("Running pandoc: ${it.command().joinToString(separator = " ")}")
            }.inheritIO().start().waitFor()
            if (exitCode != 0) {
                // Keep going: one broken document should not abort the whole export.
                logger.warn("pandoc exited with code $exitCode while converting $source")
            }
            CopyActionResult.CONTINUE
        } else {
            source.copyToIgnoringExistingDirectory(target, false)
        }
    }

    // Draft of zip packaging, not enabled (note: `zipFileName` is not defined anywhere yet).
//    ZipOutputStream(zipFileName.outputStream().buffered()).use { zipStream ->
//        outputPath.walk().forEach { file ->
//            val zipEntryPath = file.absolute().relativize(inputPath.absolute())
//            val entry = ZipEntry("$zipEntryPath${(if (file.isDirectory()) "/" else "")}")
//            zipStream.putNextEntry(entry)
//            if (file.isRegularFile()) {
//                file.inputStream().copyTo(zipStream)
//            }
//        }
//    }
}

View File

@ -3,6 +3,7 @@ package ru.mipt.npm.space.documentextractor
import io.ktor.client.engine.cio.CIO
import kotlinx.cli.ArgParser
import kotlinx.cli.ArgType
import kotlinx.cli.default
import kotlinx.cli.required
import kotlinx.coroutines.coroutineScope
import space.jetbrains.api.runtime.SpaceAppInstance
@ -29,13 +30,26 @@ suspend fun main(args: Array<String>) {
description = "The key of the exported project"
).required()
val path: String? by parser.option(ArgType.String, description = "Target directory. Default is './output/project-key'.")
val path: String? by parser.option(
ArgType.String,
description = "Target directory. Default is './output/project-key'."
)
val folderId: String? by parser.option(
ArgType.String,
description = "FolderId for the folder to export. By default uses project root."
)
val generateHtml by parser.option(
ArgType.Boolean,
description = "If defined, generate HTML directory and zip"
).default(false)
val htmlOutputPath by parser.option(
ArgType.String,
description = "Path for html output directory sibling to 'output' directory"
).default("html")
val clientId by parser.option(
ArgType.String,
description = "Space application client ID (if not defined, use environment value 'space.clientId')"
@ -48,7 +62,7 @@ suspend fun main(args: Array<String>) {
parser.parse(args)
val target: Path = path?.let { Path(it) } ?: Path.of("output/$project")
val target: Path = path?.let { Path(it) } ?: folderId?.let { Path("output") } ?: Path("output/$project")
Files.createDirectories(target)
@ -69,5 +83,8 @@ suspend fun main(args: Array<String>) {
ProjectIdentifier.Key(project),
folderId?.let { FolderIdentifier.Id(it) } ?: FolderIdentifier.Root
)
if (generateHtml) {
generateHtml(target, target.resolveSibling(htmlOutputPath))
}
}
}

View File

@ -3,7 +3,6 @@ package ru.mipt.npm.space.documentextractor
import io.ktor.client.request.header
import io.ktor.client.request.request
import io.ktor.client.request.url
import io.ktor.client.statement.HttpResponse
import io.ktor.client.statement.bodyAsChannel
import io.ktor.client.statement.readBytes
import io.ktor.http.HttpHeaders
@ -23,7 +22,7 @@ import java.nio.file.Path
import kotlin.io.path.*
import kotlin.streams.toList
private val logger by lazy { LoggerFactory.getLogger("space-extractor") }
internal val logger by lazy { LoggerFactory.getLogger("space-extractor") }
/**
* Extract single attachment image
@ -70,18 +69,19 @@ private val regex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0""".toRegex()
/**
* Post-process a markdown document by downloading images and replacing links
*/
internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope{
internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope {
val documentBody = path.readText()
val logger = LoggerFactory.getLogger("space-document-extractor")
logger.info("Processing file $path...")
val newText = documentBody.replace(regex) {
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
val alt = it.groups["alt"]?.value
logger.info("Downloading image $id as images/$id")
val imageName = alt ?: id
logger.info("Downloading image $id as images/$imageName")
launch(Dispatchers.IO) {
extractImage(path.parent, id, id)
extractImage(path.parent, id, imageName)
}
"![$alt](images/$id"
"![$alt](images/$imageName"
}
path.writeText(newText)
}
@ -117,10 +117,12 @@ internal suspend fun SpaceClient.downloadDocument(
extractFile(directory, document.id, document.title)
}
}
is TextDocument -> {
val markdownFilePath = directory.resolve(document.title + ".md")
markdownFilePath.writeText(body.text, Charsets.UTF_8)
}
else -> {
LoggerFactory.getLogger("space-extractor")
.warn("Can't extract document ${document.title} with type ${document.bodyType}")
@ -165,7 +167,7 @@ suspend fun SpaceClient.downloadAndProcessDocumentsInProject(
directory: Path,
projectId: ProjectIdentifier,
rootFolder: FolderIdentifier = FolderIdentifier.Root,
) = withContext(Dispatchers.IO){
) = withContext(Dispatchers.IO) {
logger.info("Processing project ${projectId.compactId} to $directory")
downloadDocumentFolder(directory, projectId, rootFolder)
processMarkdownInDirectory(directory)

View File

@ -0,0 +1,5 @@
# links-to-html.lua
--- Pandoc filter hook: point links to Markdown documents at their generated HTML pages.
-- Only a `.md` extension that terminates the path part of the target is rewritten; a trailing
-- `#fragment` or `?query` is preserved. Targets that carry a URL scheme (`https:`, `mailto:`, ...)
-- are left untouched, since no HTML page is generated for external documents.
-- @param el link element; its `target` field holds the link destination string
-- @return the same element, with `target` possibly rewritten
function Link(el)
  local target = el.target
  -- A scheme is a letter followed by at least one more scheme character and a colon.
  if target:match("^%a[%w+.-]+:") then
    return el
  end
  -- Split "path#fragment" / "path?query" so that only the path's extension is examined.
  local path, suffix = target:match("^([^#?]*)(.*)$")
  -- Parentheses drop gsub's second return value (the substitution count).
  el.target = (path:gsub("%.md$", ".html")) .. suffix
  return el
end