Use full link instead of fragments

This commit is contained in:
Alexander Nozik 2023-07-03 10:03:36 +03:00
parent 0d158269a0
commit da6ba9ab01
6 changed files with 190 additions and 80 deletions

View File

@ -7,7 +7,7 @@ plugins {
id("com.github.johnrengelman.shadow") version "8.1.1" id("com.github.johnrengelman.shadow") version "8.1.1"
} }
group = "ru.mipt.npm" group = "center.sciprog"
version = "1.0-SNAPSHOT" version = "1.0-SNAPSHOT"
repositories { repositories {
@ -15,9 +15,13 @@ repositories {
maven("https://maven.pkg.jetbrains.space/public/p/space/maven") maven("https://maven.pkg.jetbrains.space/public/p/space/maven")
} }
val ktorVersion = "2.3.1"
dependencies { dependencies {
implementation("org.jetbrains:space-sdk-jvm:159302-beta") implementation("org.jetbrains:space-sdk-jvm:163093-beta")
implementation("io.ktor:ktor-client-cio-jvm:2.3.1") implementation("io.ktor:ktor-client-cio-jvm:$ktorVersion")
implementation("io.ktor:ktor-server-core-jvm:$ktorVersion")
implementation("io.ktor:ktor-server-cio-jvm:$ktorVersion")
implementation("org.jetbrains.kotlinx:kotlinx-cli:0.3.5") implementation("org.jetbrains.kotlinx:kotlinx-cli:0.3.5")
implementation("ch.qos.logback:logback-classic:1.4.8") implementation("ch.qos.logback:logback-classic:1.4.8")
testImplementation(kotlin("test")) testImplementation(kotlin("test"))
@ -32,7 +36,7 @@ kotlin {
} }
application { application {
mainClass.set("ru.mipt.npm.space.documentextractor.MainKt") mainClass.set("center.sciprog.space.documentextractor.MainKt")
} }
tasks.withType<ShadowJar>{ tasks.withType<ShadowJar>{

View File

@ -1,11 +1,11 @@
package ru.mipt.npm.space.documentextractor package center.sciprog.space.documentextractor
import java.nio.file.Path import java.nio.file.Path
import java.nio.file.StandardOpenOption import java.nio.file.StandardOpenOption
import kotlin.io.path.* import kotlin.io.path.*
internal fun prepareScripts(inputPath: Path): Path { internal fun prepareScripts(outputPath: Path): Path {
val scriptPath = inputPath.resolveSibling("scripts").resolve("links-to-html.lua") val scriptPath = outputPath.resolveSibling("@scripts").resolve("links-to-html.lua")
if (!scriptPath.exists()) { if (!scriptPath.exists()) {
scriptPath.parent.createDirectories() scriptPath.parent.createDirectories()
scriptPath.writeText( scriptPath.writeText(
@ -18,27 +18,20 @@ internal fun prepareScripts(inputPath: Path): Path {
} }
/**
* Convert a directory of markdown files to HTML, copying other files as is.
*/
@OptIn(ExperimentalPathApi::class) @OptIn(ExperimentalPathApi::class)
fun convert(inputPath: Path, outputPath: Path) { fun convertToHtml(inputPath: Path, outputPath: Path, indexFileName: String = "index") {
val scriptPath = prepareScripts(inputPath) val scriptPath = prepareScripts(outputPath)
inputPath.copyToRecursively(outputPath, followLinks = false) { source: Path, target: Path -> inputPath.copyToRecursively(outputPath, followLinks = false) { source: Path, target: Path ->
if (source.isRegularFile() && source.extension == "md") { if (source.isRegularFile() && source.extension == "md") {
val docxPath = target.parent.resolve(source.fileName.nameWithoutExtension + ".docx") val htmlFileName = source.fileName.nameWithoutExtension.let {
if (it == indexFileName) "index" else it
}
ProcessBuilder( val htmlPath = target.parent.resolve("$htmlFileName.html")
"pandoc",
"--from=markdown",
"--to=docx",
"--lua-filter=${scriptPath.absolute()}",
"--output=${docxPath.absolute()}",
"${source.absolute()}",
).also {
logger.info("Running pandoc: ${it.command().joinToString(separator = " ")}")
}.directory(source.parent.toFile()).inheritIO().start().waitFor()
val htmlPath = target.parent.resolve(source.fileName.nameWithoutExtension + ".html")
ProcessBuilder( ProcessBuilder(
"pandoc", "pandoc",
@ -50,10 +43,9 @@ fun convert(inputPath: Path, outputPath: Path) {
"--lua-filter=${scriptPath.absolute()}", "--lua-filter=${scriptPath.absolute()}",
"--output=${htmlPath.absolute()}", "--output=${htmlPath.absolute()}",
"${source.absolute()}", "${source.absolute()}",
).also { ).also {
logger.info("Running pandoc: ${it.command().joinToString(separator = " ")}") logger.info("Running pandoc: ${it.command().joinToString(separator = " ")}")
}.inheritIO().start().waitFor() }.directory(source.parent.toFile()).inheritIO().start().waitFor()
CopyActionResult.CONTINUE CopyActionResult.CONTINUE
@ -61,24 +53,33 @@ fun convert(inputPath: Path, outputPath: Path) {
source.copyToIgnoringExistingDirectory(target, false) source.copyToIgnoringExistingDirectory(target, false)
} }
} }
}
// inputPath.walk().filter { it.extension == "md" }.forEach { source -> /**
// * Convert a directory of markdown files to docx files, copying other files as is
// val docxPath = */
// (outputPath / source.relativize(inputPath)).resolveSibling(source.fileName.nameWithoutExtension + ".docx") @OptIn(ExperimentalPathApi::class)
// fun convertToDocX(inputPath: Path, outputPath: Path) {
// ProcessBuilder( inputPath.copyToRecursively(outputPath, followLinks = false) { source: Path, target: Path ->
// "pandoc", if (source.isRegularFile() && source.extension == "md") {
// "--from=markdown",
// "--to=docx", val docxPath = target.parent.resolve(source.fileName.nameWithoutExtension + ".docx")
// "--lua-filter=${scriptPath.absolute()}",
// "--output=${docxPath.absolute()}", ProcessBuilder(
// "${source.absolute()}", "pandoc",
// "--standalone",
// ).also { "--from=markdown",
// logger.info("Running pandoc: ${it.command().joinToString(separator = " ")}") "--to=docx",
// }.inheritIO().start().waitFor() "--output=${docxPath.absolute()}",
// } "${source.absolute()}",
).also {
logger.info("Running pandoc: ${it.command().joinToString(separator = " ")}")
}.directory(source.parent.toFile()).inheritIO().start().waitFor()
CopyActionResult.CONTINUE
} else {
source.copyToIgnoringExistingDirectory(target, false)
}
}
} }

View File

@ -1,4 +1,4 @@
package ru.mipt.npm.space.documentextractor package center.sciprog.space.documentextractor
import io.ktor.client.engine.cio.CIO import io.ktor.client.engine.cio.CIO
import kotlinx.cli.ArgParser import kotlinx.cli.ArgParser
@ -16,39 +16,43 @@ import space.jetbrains.api.runtime.types.ProjectIdentifier
import java.nio.file.Files import java.nio.file.Files
import java.nio.file.Path import java.nio.file.Path
import kotlin.io.path.Path import kotlin.io.path.Path
import kotlin.io.path.createDirectories
/**
 * Pattern for a Space folder or document URL; `main` applies it to the whole `url` argument with `matchEntire`.
 *
 * Named groups:
 * - `spaceUrl`: `http://` or `https://` followed by everything up to the next slash;
 * - `projectName`: the path segment right after the `p` segment;
 * - `folderId`: everything after the last `-` in the remainder of the URL (the wildcard before that dash is greedy,
 *   so dashes earlier in the URL, e.g. inside the project name, are skipped over).
 *
 * The `${'$'}` template only inserts a literal dollar sign (end-of-input anchor) into the raw string.
 */
internal val urlRegex =
"""(?<spaceUrl>https?:\/\/[^\/]*)\/p\/(?<projectName>[^\/]*)\/.*-(?<folderId>.*)${'$'}""".toRegex()
suspend fun main(args: Array<String>) { suspend fun main(args: Array<String>) {
val parser = ArgParser("space-document-extractor") val parser = ArgParser("space-document-extractor")
val spaceUrl by parser.option( val url by parser.option(
ArgType.String, ArgType.String,
description = "Url of the space instance like 'https://mipt-npm.jetbrains.space'" description = "Url of the folder like 'https://spc.jetbrains.space/p/mipt-npm/documents/folders?f=SPC-qn7al1VorKp' or 'https://spc.jetbrains.space/p/mipt-npm/documents/SPC/f/SPC-qn7al1VorKp?f=SPC-qn7al1VorKp'"
).required()
val project by parser.option(
ArgType.String,
description = "The key of the exported project"
).required() ).required()
val path: String? by parser.option( val path: String? by parser.option(
ArgType.String, ArgType.String,
description = "Target directory. Default is './output/project-key'." description = "Target directory. Default is './markdown/<id>'."
) )
val folderId: String? by parser.option( val html by parser.option(
ArgType.String,
description = "FolderId for the folder to export. By default uses project root."
)
val convert by parser.option(
ArgType.Boolean, ArgType.Boolean,
description = "If defined, convert result to HTML and DOCX on download" description = "Convert Markdown to HTML via pandoc"
).default(false) ).default(false)
val convertOutputPath by parser.option( val htmlPath by parser.option(
ArgType.String, ArgType.String,
description = "Path for html and docx output directory sibling to 'output' directory" description = "Path for html output. Default is './html/<id>"
).default("converted") )
val docx by parser.option(
ArgType.Boolean,
description = "Convert Markdown to DOCX via pandoc"
).default(false)
val docxPath by parser.option(
ArgType.String,
description = "Path for docx output. Default is './docx/<id>"
)
val clientId by parser.option( val clientId by parser.option(
ArgType.String, ArgType.String,
@ -62,29 +66,49 @@ suspend fun main(args: Array<String>) {
parser.parse(args) parser.parse(args)
val target: Path = path?.let { Path(it) } ?: folderId?.let { Path("output") } ?: Path("output/$project") val urlMatch = urlRegex.matchEntire(url) ?: error("Url $url does not match space document url pattern")
Files.createDirectories(target) val spaceUrl = urlMatch.groups["spaceUrl"]?.value ?: error("Space Url token not recognized")
val space: SpaceClient = SpaceClient( val project = urlMatch.groups["projectName"]?.value ?: error("Project name token not recognized")
ktorClientForSpace(CIO),
SpaceAppInstance( val folderId = urlMatch.groups["folderId"]?.value ?: error("Folder or document token not recognized")
val markdownPath: Path = path?.let { Path(it) } ?: Path("markdown/$folderId")
Files.createDirectories(markdownPath)
val appInstance = SpaceAppInstance(
clientId ?: System.getProperty("space.clientId"), clientId ?: System.getProperty("space.clientId"),
clientSecret ?: System.getProperty("space.clientSecret"), clientSecret ?: System.getProperty("space.clientSecret"),
spaceUrl spaceUrl
), )
val spaceClient: SpaceClient = SpaceClient(
ktorClientForSpace(CIO),
appInstance,
SpaceAuth.ClientCredentials() SpaceAuth.ClientCredentials()
) )
coroutineScope { coroutineScope {
println("Processing project \"${space.projects.getProject(ProjectIdentifier.Key(project)).name}\"") println("Processing project \"${spaceClient.projects.getProject(ProjectIdentifier.Key(project)).name}\"")
space.downloadAndProcessDocumentsInProject( spaceClient.downloadAndProcessDocumentsInProject(
target, markdownPath,
ProjectIdentifier.Key(project), ProjectIdentifier.Key(project),
folderId?.let { FolderIdentifier.Id(it) } ?: FolderIdentifier.Root FolderIdentifier.Id(folderId)
) )
if (convert) { if (html) {
convert(target, target.resolveSibling(convertOutputPath)) val htmlTargetPath = path?.let { Path(it) }?.resolve(htmlPath ?: "html")
?: Path(htmlPath ?: "html/$folderId")
htmlTargetPath.createDirectories()
convertToHtml(markdownPath, htmlTargetPath)
}
if (docx) {
val docxTargetPath = path?.let { Path(it) }?.resolve(docxPath ?: "docx")
?: Path(docxPath ?: "docx/$folderId")
docxTargetPath.createDirectories()
convertToDocX(markdownPath, docxTargetPath)
} }
} }
} }

View File

@ -1,4 +1,4 @@
package ru.mipt.npm.space.documentextractor package center.sciprog.space.documentextractor
import io.ktor.client.request.header import io.ktor.client.request.header
import io.ktor.client.request.request import io.ktor.client.request.request
@ -67,7 +67,7 @@ internal suspend fun SpaceClient.extractFile(
private val regex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0""".toRegex() private val regex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0""".toRegex()
/** /**
* Post-process a markdown document by downloading images and replacing links * Post-process a Markdown document by downloading images and replacing links
*/ */
internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope { internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope {
val documentBody = path.readText() val documentBody = path.readText()
@ -87,7 +87,11 @@ internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutine
} }
/** /**
* Download images for markdown documents in the directory * Download images for Markdown documents in the directory
*
* Images are always stored in the same directory as files themselves
*
* @param recursive turn recursive mode on or off
*/ */
internal suspend fun SpaceClient.processMarkdownInDirectory( internal suspend fun SpaceClient.processMarkdownInDirectory(
path: Path, path: Path,
@ -162,6 +166,8 @@ internal suspend fun SpaceClient.downloadDocumentFolder(
/** /**
* Download all documents in a project or a folder with given [rootFolder] and postprocess files * Download all documents in a project or a folder with given [rootFolder] and postprocess files
*
* @param directory target directory
*/ */
suspend fun SpaceClient.downloadAndProcessDocumentsInProject( suspend fun SpaceClient.downloadAndProcessDocumentsInProject(
directory: Path, directory: Path,

76
src/main/kotlin/server.kt Normal file
View File

@ -0,0 +1,76 @@
package center.sciprog.space.documentextractor
//
//import io.ktor.http.HttpStatusCode
//import io.ktor.server.application.Application
//import io.ktor.server.application.call
//import io.ktor.server.cio.CIO
//import io.ktor.server.engine.embeddedServer
//import io.ktor.server.request.*
//import io.ktor.server.response.respond
//import io.ktor.server.routing.*
//import space.jetbrains.api.runtime.SpaceAppInstance
//import space.jetbrains.api.runtime.SpaceAuth
//import space.jetbrains.api.runtime.SpaceClient
//import space.jetbrains.api.runtime.helpers.readPayload
//import space.jetbrains.api.runtime.helpers.verifyWithPublicKey
//import space.jetbrains.api.runtime.ktorClientForSpace
//import space.jetbrains.api.runtime.types.ListCommandsPayload
//import space.jetbrains.api.runtime.types.MessagePayload
//
//fun Application.configureRouting(spaceClient: SpaceClient) {
// val appInstance = SpaceAppInstance(
// environment.config.property("space.clientId"),
//
// clientSecret ?: System.getProperty("space.clientSecret"),
// spaceUrl
// )
//
//
// val spaceClient: SpaceClient = SpaceClient(
// ktorClientForSpace(io.ktor.client.engine.cio.CIO),
// appInstance,
// SpaceAuth.ClientCredentials()
// )
//
// routing {
// post("api/space") {
// // read request body
// val body = call.receiveText()
//
// // read headers required for Space verification
// val signature = call.request.header("X-Space-Public-Key-Signature")
// val timestamp = call.request.header("X-Space-Timestamp")?.toLongOrNull()
// // verifyWithPublicKey gets a key from Space, uses it to generate message hash
// // and compares the generated hash to the hash in a message
// if (signature.isNullOrBlank() || timestamp == null || !spaceClient.verifyWithPublicKey(
// body, timestamp, signature
// )
// ) {
// call.respond(HttpStatusCode.Unauthorized)
// return@post
// }
//
// // analyze the message payload
// // MessagePayload = user sends a command
// // ListCommandsPayload = user types a slash or a char
// when (val payload = readPayload(body)) {
// is MessagePayload -> {
// runHelpCommand(payload)
// call.respond(HttpStatusCode.OK, "")
// }
//
// is ListCommandsPayload -> {
//
// }
// }
// }
// }
//}
//
//fun main() {
// embeddedServer(CIO, port = 8080) {
// val
//
// configureRouting()
// }.start(wait = true)
//}

View File

@ -1,4 +1,3 @@
# links-to-html.lua
function Link(el) function Link(el)
el.target = string.gsub(el.target, "%.md", ".html") el.target = string.gsub(el.target, "%.md", ".html")
return el return el