Fix export of non-Markdown documents and process inline images during document download.

This commit is contained in:
Alexander Nozik 2023-09-03 19:54:21 +03:00
parent 5f601cbd72
commit 46e3fb966c
3 changed files with 59 additions and 79 deletions

View File

@ -3,11 +3,9 @@ package center.sciprog.space.documentextractor
import io.ktor.client.request.header import io.ktor.client.request.header
import io.ktor.client.request.request import io.ktor.client.request.request
import io.ktor.client.request.url import io.ktor.client.request.url
import io.ktor.client.statement.bodyAsChannel
import io.ktor.client.statement.readBytes import io.ktor.client.statement.readBytes
import io.ktor.http.HttpHeaders import io.ktor.http.HttpHeaders
import io.ktor.http.HttpMethod import io.ktor.http.HttpMethod
import io.ktor.utils.io.jvm.javaio.copyTo
import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.coroutineScope import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.launch import kotlinx.coroutines.launch
@ -17,21 +15,21 @@ import space.jetbrains.api.runtime.Batch
import space.jetbrains.api.runtime.SpaceClient import space.jetbrains.api.runtime.SpaceClient
import space.jetbrains.api.runtime.resources.projects import space.jetbrains.api.runtime.resources.projects
import space.jetbrains.api.runtime.types.* import space.jetbrains.api.runtime.types.*
import java.nio.file.Files
import java.nio.file.Path import java.nio.file.Path
import kotlin.io.path.* import kotlin.io.path.createDirectories
import kotlin.streams.toList import kotlin.io.path.writeBytes
import kotlin.io.path.writeText
internal val logger by lazy { LoggerFactory.getLogger("space-extractor") } internal val logger by lazy { LoggerFactory.getLogger("space-extractor") }
/** /**
* Extract single attachment image * Extract single attachment image
*/ */
internal suspend fun SpaceClient.extractImage( internal suspend fun SpaceClient.extractAttachment(
imageFile: Path, imageFile: Path,
imageId: String, imageId: String,
) { ) {
logger.info("Downloading image file to $imageFile") logger.info("Downloading attachment file to $imageFile")
val response = ktorClient.request { val response = ktorClient.request {
url("${server.serverUrl}/d/$imageId") url("${server.serverUrl}/d/$imageId")
method = HttpMethod.Get method = HttpMethod.Get
@ -54,57 +52,10 @@ internal suspend fun SpaceClient.extractFile(
method = HttpMethod.Get method = HttpMethod.Get
header(HttpHeaders.Authorization, "Bearer ${token().accessToken}") header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
} }
documentFile.outputStream().use { documentFile.writeBytes(response.readBytes())
response.bodyAsChannel().copyTo(it)
}
} }
private val regex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0""".toRegex() private val imageRegex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0( "(?<name>.*)")?\)""".toRegex()
/**
* Post-process a Markdown document by downloading images and replacing links
*/
internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope {
val documentBody = path.readText()
val logger = LoggerFactory.getLogger("space-document-extractor")
logger.info("Processing file $path...")
val imageDirectory = path.parent.resolve("images")
imageDirectory.createDirectories()
val newText = documentBody.replace(regex) {
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
val alt = it.groups["alt"]?.value
val imageName = alt?.let { "$id-$alt" } ?: id
val imageFile = imageDirectory.resolve(imageName)
logger.info("Downloading image $id as $imageFile")
launch(Dispatchers.IO) {
extractImage(imageFile, id)
}
"![$alt](images/$imageName"
}
path.writeText(newText)
}
/**
* Download images for Markdown documents in the directory
*
* Images are always stored in the same directory as files themselves
*
* @param recursive turn recursive mode on or off
*/
internal suspend fun SpaceClient.processMarkdownInDirectory(
path: Path,
fileExtension: String = ".md",
recursive: Boolean = true,
) {
Files.list(path).toList().forEach {
if (it.toString().endsWith(fileExtension)) {
logger.info("Updating links in a markdown $it")
processMarkdownDocument(it)
} else if (recursive && it.isDirectory()) {
processMarkdownInDirectory(it, fileExtension)
}
}
}
/** /**
* Download single Space document * Download single Space document
@ -118,7 +69,7 @@ internal suspend fun SpaceClient.downloadDocument(
launch(Dispatchers.IO) { launch(Dispatchers.IO) {
val filePath = try { val filePath = try {
directory.resolve(document.title) directory.resolve(document.title)
} catch (ex: Exception){ } catch (ex: Exception) {
directory.resolve(document.id) directory.resolve(document.id)
} }
extractFile(filePath, document.id) extractFile(filePath, document.id)
@ -131,19 +82,30 @@ internal suspend fun SpaceClient.downloadDocument(
} catch (ex: Exception) { } catch (ex: Exception) {
directory.resolve(document.id + ".md") directory.resolve(document.id + ".md")
} }
val content = body.docContent when (val content = body.docContent) {
if (content is MdTextDocumentContent) { is MdTextDocumentContent -> {
markdownFilePath.writeText(content.markdown, Charsets.UTF_8) val imageDirectory = directory.resolve("images")
} else { imageDirectory.createDirectories()
val newText = content.markdown.replace(imageRegex) {
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
val alt = it.groups["alt"]?.value?.ifBlank { null }
val name = it.groups["name"]?.value?.ifBlank { null }
val imageName = name?.let { "$id-$name" } ?: alt?.let { "$id-$alt" } ?: id
val imageFile = imageDirectory.resolve(imageName)
logger.info("Downloading image $id as $imageFile")
launch(Dispatchers.IO) { launch(Dispatchers.IO) {
val filePath = try { extractAttachment(imageFile, id)
directory.resolve(document.title)
} catch (ex: Exception){
directory.resolve(document.id)
} }
extractFile(filePath, document.id) "![${alt?:""}](images/$imageName)"
}
markdownFilePath.writeText(newText, Charsets.UTF_8)
}
else -> {
logger.error("Rich text documents are unsupported (${document.title})")
} }
} }
} }
else -> { else -> {
@ -168,13 +130,30 @@ internal suspend fun SpaceClient.downloadDocumentFolder(
} }
documents.data.forEach { documents.data.forEach {
val document = projects.documents.getDocument(projectId, it.id) { val document = projects.documents.getDocument(projectId, it.id) {
id()
title()
body() body()
bodyType() title()
id()
} }
val bodyInfo = document.body
if ((bodyInfo is TextDocumentHttpBody) && bodyInfo.docContent !is MdTextDocumentContent) {
//make a conversion to markdown before downloading
logger.info("Converting document ${document.title} in ${folderId.compactId} to markdown format")
val convertedDocument = projects.documents.updateDocument(
project = projectId,
documentId = it.id,
updateIn = TextDocumentBodyConvertTypeIn(
type = DraftDocumentType.MARKDOWN
)
) {
body()
title()
id()
}
downloadDocument(directory, convertedDocument)
} else {
downloadDocument(directory, document) downloadDocument(directory, document)
} }
}
val subFolders: Batch<DocumentFolder> = projects.documents.folders.subfolders.listSubfolders(projectId, folderId) val subFolders: Batch<DocumentFolder> = projects.documents.folders.subfolders.listSubfolders(projectId, folderId)
subFolders.data.forEach { subFolders.data.forEach {
@ -195,5 +174,5 @@ suspend fun SpaceClient.downloadAndProcessDocumentsInProject(
) = withContext(Dispatchers.IO) { ) = withContext(Dispatchers.IO) {
logger.info("Processing project ${projectId.compactId} to $directory") logger.info("Processing project ${projectId.compactId} to $directory")
downloadDocumentFolder(directory, projectId, rootFolder) downloadDocumentFolder(directory, projectId, rootFolder)
processMarkdownInDirectory(directory) // processMarkdownInDirectory(directory)
} }

View File

@ -57,7 +57,7 @@ private suspend fun SpaceClient.writeMessages(
val fileId = attachment.id val fileId = attachment.id
val name = "${attachment.id}-${attachment.filename}" val name = "${attachment.id}-${attachment.filename}"
val file = attachmentsDirectory.resolve(name) val file = attachmentsDirectory.resolve(name)
extractFile(file, fileId) extractAttachment(file, fileId)
writer.appendLine("*Attachment*: [$name](attachments/$name)\n") writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
} }
@ -65,7 +65,7 @@ private suspend fun SpaceClient.writeMessages(
val fileId = attachment.id val fileId = attachment.id
val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId
val file = attachmentsDirectory.resolve(name) val file = attachmentsDirectory.resolve(name)
extractFile(file, fileId) extractAttachment(file, fileId)
writer.appendLine("*Attachment*: [$name](attachments/$name)\n") writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
} }
@ -73,7 +73,7 @@ private suspend fun SpaceClient.writeMessages(
val fileId = attachment.id val fileId = attachment.id
val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId
val file = attachmentsDirectory.resolve(name) val file = attachmentsDirectory.resolve(name)
extractFile(file, fileId) extractAttachment(file, fileId)
writer.appendLine("*Attachment*: [$name](attachments/$name)\n") writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
} }
} }
@ -104,6 +104,7 @@ suspend fun SpaceClient.extractMessages(
name() name()
username() username()
} }
name()
} }
contact{ contact{
key() key()

View File

@ -83,7 +83,7 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
val folderId: String? = urlMatch.groups["folderId"]?.value val folderId: String? = urlMatch.groups["folderId"]?.value
val markdownPath: Path = path?.let { Path(it) } ?: Path("markdown/$folderId") val markdownPath: Path = path?.let { Path(it) } ?: Path("markdown/${folderId ?: project}")
Files.createDirectories(markdownPath) Files.createDirectories(markdownPath)
@ -108,13 +108,13 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
) )
if (html) { if (html) {
val htmlTargetPath = path?.let { Path(it) }?.resolve(htmlPath ?: "html") val htmlTargetPath = path?.let { Path(it) }?.resolve(htmlPath ?: "html")
?: Path(htmlPath ?: "html/$folderId") ?: Path(htmlPath ?: "html/${folderId ?: project}")
htmlTargetPath.createDirectories() htmlTargetPath.createDirectories()
convertToHtml(markdownPath, htmlTargetPath) convertToHtml(markdownPath, htmlTargetPath)
} }
if (docx) { if (docx) {
val docxTargetPath = path?.let { Path(it) }?.resolve(docxPath ?: "docx") val docxTargetPath = path?.let { Path(it) }?.resolve(docxPath ?: "docx")
?: Path(docxPath ?: "docx/$folderId") ?: Path(docxPath ?: "docx/${folderId ?: project}")
docxTargetPath.createDirectories() docxTargetPath.createDirectories()
convertToDocX(markdownPath, docxTargetPath) convertToDocX(markdownPath, docxTargetPath)
} }
@ -123,7 +123,7 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
companion object { companion object {
private val urlRegex = private val urlRegex =
"""(?<spaceUrl>https?:\/\/[^\/]*)\/p\/(?<projectName>[^\/]*)\/.*-(?<folderId>.*)${'$'}""".toRegex() """(?<spaceUrl>https?:\/\/[^\/]*)\/p\/(?<projectName>[^\/]*)(\/.*-(?<folderId>.*)${'$'})?""".toRegex()
} }
} }