Fix document export for non-Markdown documents and inline image processing.

This commit is contained in:
Alexander Nozik 2023-09-03 19:54:21 +03:00
parent 5f601cbd72
commit 46e3fb966c
3 changed files with 59 additions and 79 deletions

View File

@ -3,11 +3,9 @@ package center.sciprog.space.documentextractor
import io.ktor.client.request.header
import io.ktor.client.request.request
import io.ktor.client.request.url
import io.ktor.client.statement.bodyAsChannel
import io.ktor.client.statement.readBytes
import io.ktor.http.HttpHeaders
import io.ktor.http.HttpMethod
import io.ktor.utils.io.jvm.javaio.copyTo
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.launch
@ -17,21 +15,21 @@ import space.jetbrains.api.runtime.Batch
import space.jetbrains.api.runtime.SpaceClient
import space.jetbrains.api.runtime.resources.projects
import space.jetbrains.api.runtime.types.*
import java.nio.file.Files
import java.nio.file.Path
import kotlin.io.path.*
import kotlin.streams.toList
import kotlin.io.path.createDirectories
import kotlin.io.path.writeBytes
import kotlin.io.path.writeText
// Shared logger named "space-extractor"; `by lazy` defers its creation until the first access.
internal val logger by lazy { LoggerFactory.getLogger("space-extractor") }
/**
* Extract single attachment image
*/
internal suspend fun SpaceClient.extractImage(
internal suspend fun SpaceClient.extractAttachment(
imageFile: Path,
imageId: String,
) {
logger.info("Downloading image file to $imageFile")
logger.info("Downloading attachment file to $imageFile")
val response = ktorClient.request {
url("${server.serverUrl}/d/$imageId")
method = HttpMethod.Get
@ -54,57 +52,10 @@ internal suspend fun SpaceClient.extractFile(
method = HttpMethod.Get
header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
}
documentFile.outputStream().use {
response.bodyAsChannel().copyTo(it)
}
documentFile.writeBytes(response.readBytes())
}
// Matches the head of an image reference of the form `![alt](/d/<id>?f=0`.
// The optional title and the closing parenthesis are deliberately NOT matched, so they survive the replacement.
// Negated character classes (instead of greedy `.*`) keep every match inside a single image link,
// which matters when several images share one line.
private val regex = """!\[(?<alt>[^\]]*)]\(/d/(?<id>[^?)]*)\?f=0""".toRegex()

/**
 * Post-process a Markdown document by downloading images and replacing links
 *
 * Every `![alt](/d/<id>?f=0...` reference is rewritten to `![alt](images/<name>...`, where `<name>` is
 * `<id>-<alt>` when the alt text is not blank and `<id>` otherwise. The image itself is downloaded
 * concurrently into an `images` directory next to the document. The function returns only after all
 * downloads launched in its scope have completed and the rewritten text has been written back to [path].
 *
 * @param path the Markdown file to rewrite in place
 */
internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope {
    val documentBody = path.readText()
    // NOTE: intentionally shadows the file-level logger to keep the dedicated logger name.
    val logger = LoggerFactory.getLogger("space-document-extractor")
    logger.info("Processing file $path...")

    // resolveSibling also works for a bare relative file name, where `path.parent` would be null.
    val imageDirectory = path.resolveSibling("images")
    imageDirectory.createDirectories()

    val newText = documentBody.replace(regex) { match ->
        val id = match.groups["id"]?.value ?: error("Unexpected reference format: ${match.value}")
        // The alt group always participates in a match, so an empty alt text must be mapped to null explicitly.
        val alt = match.groups["alt"]?.value?.ifBlank { null }
        val imageName = alt?.let { "$id-$it" } ?: id
        val imageFile = imageDirectory.resolve(imageName)
        logger.info("Downloading image $id as $imageFile")
        launch(Dispatchers.IO) {
            extractImage(imageFile, id)
        }
        // No closing parenthesis here: the tail of the original link is left in place by the regex.
        "![${alt.orEmpty()}](images/$imageName"
    }
    path.writeText(newText)
}
/**
 * Download images for Markdown documents in the directory
 *
 * Images are always stored in the same directory as files themselves
 *
 * @param path the directory to scan
 * @param fileExtension suffix a file name must end with to be processed as a Markdown document
 * @param recursive turn recursive mode on or off
 */
internal suspend fun SpaceClient.processMarkdownInDirectory(
    path: Path,
    fileExtension: String = ".md",
    recursive: Boolean = true,
) {
    // listDirectoryEntries closes the underlying directory stream, unlike a bare Files.list(...) call.
    for (entry in path.listDirectoryEntries()) {
        if (entry.isDirectory()) {
            // Directories are only ever descended into, even when their name ends with the extension.
            if (recursive) {
                processMarkdownInDirectory(entry, fileExtension, recursive)
            }
        } else if (entry.toString().endsWith(fileExtension)) {
            logger.info("Updating links in a markdown $entry")
            processMarkdownDocument(entry)
        }
    }
}
// Matches a complete image reference: `![alt](/d/<id>?f=0)` or `![alt](/d/<id>?f=0 "name")`.
// Negated character classes are used instead of greedy `.*` so that a match can never run past the end
// of its own link; otherwise two images on one line collapse into a single wrong match.
private val imageRegex = """!\[(?<alt>[^\]]*)]\(/d/(?<id>[^?)]*)\?f=0( "(?<name>[^"]*)")?\)""".toRegex()
/**
* Download single Space document
@ -118,7 +69,7 @@ internal suspend fun SpaceClient.downloadDocument(
launch(Dispatchers.IO) {
val filePath = try {
directory.resolve(document.title)
} catch (ex: Exception){
} catch (ex: Exception) {
directory.resolve(document.id)
}
extractFile(filePath, document.id)
@ -131,19 +82,30 @@ internal suspend fun SpaceClient.downloadDocument(
} catch (ex: Exception) {
directory.resolve(document.id + ".md")
}
val content = body.docContent
if (content is MdTextDocumentContent) {
markdownFilePath.writeText(content.markdown, Charsets.UTF_8)
} else {
when (val content = body.docContent) {
is MdTextDocumentContent -> {
val imageDirectory = directory.resolve("images")
imageDirectory.createDirectories()
val newText = content.markdown.replace(imageRegex) {
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
val alt = it.groups["alt"]?.value?.ifBlank { null }
val name = it.groups["name"]?.value?.ifBlank { null }
val imageName = name?.let { "$id-$name" } ?: alt?.let { "$id-$alt" } ?: id
val imageFile = imageDirectory.resolve(imageName)
logger.info("Downloading image $id as $imageFile")
launch(Dispatchers.IO) {
val filePath = try {
directory.resolve(document.title)
} catch (ex: Exception){
directory.resolve(document.id)
extractAttachment(imageFile, id)
}
extractFile(filePath, document.id)
"![${alt?:""}](images/$imageName)"
}
markdownFilePath.writeText(newText, Charsets.UTF_8)
}
else -> {
logger.error("Rich text documents are unsupported (${document.title})")
}
}
}
else -> {
@ -168,13 +130,30 @@ internal suspend fun SpaceClient.downloadDocumentFolder(
}
documents.data.forEach {
val document = projects.documents.getDocument(projectId, it.id) {
id()
title()
body()
bodyType()
title()
id()
}
val bodyInfo = document.body
if ((bodyInfo is TextDocumentHttpBody) && bodyInfo.docContent !is MdTextDocumentContent) {
//make a conversion to markdown before downloading
logger.info("Converting document ${document.title} in ${folderId.compactId} to markdown format")
val convertedDocument = projects.documents.updateDocument(
project = projectId,
documentId = it.id,
updateIn = TextDocumentBodyConvertTypeIn(
type = DraftDocumentType.MARKDOWN
)
) {
body()
title()
id()
}
downloadDocument(directory, convertedDocument)
} else {
downloadDocument(directory, document)
}
}
val subFolders: Batch<DocumentFolder> = projects.documents.folders.subfolders.listSubfolders(projectId, folderId)
subFolders.data.forEach {
@ -195,5 +174,5 @@ suspend fun SpaceClient.downloadAndProcessDocumentsInProject(
) = withContext(Dispatchers.IO) {
logger.info("Processing project ${projectId.compactId} to $directory")
downloadDocumentFolder(directory, projectId, rootFolder)
processMarkdownInDirectory(directory)
// processMarkdownInDirectory(directory)
}

View File

@ -57,7 +57,7 @@ private suspend fun SpaceClient.writeMessages(
val fileId = attachment.id
val name = "${attachment.id}-${attachment.filename}"
val file = attachmentsDirectory.resolve(name)
extractFile(file, fileId)
extractAttachment(file, fileId)
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
}
@ -65,7 +65,7 @@ private suspend fun SpaceClient.writeMessages(
val fileId = attachment.id
val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId
val file = attachmentsDirectory.resolve(name)
extractFile(file, fileId)
extractAttachment(file, fileId)
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
}
@ -73,7 +73,7 @@ private suspend fun SpaceClient.writeMessages(
val fileId = attachment.id
val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId
val file = attachmentsDirectory.resolve(name)
extractFile(file, fileId)
extractAttachment(file, fileId)
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
}
}
@ -104,6 +104,7 @@ suspend fun SpaceClient.extractMessages(
name()
username()
}
name()
}
contact{
key()

View File

@ -83,7 +83,7 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
val folderId: String? = urlMatch.groups["folderId"]?.value
val markdownPath: Path = path?.let { Path(it) } ?: Path("markdown/$folderId")
val markdownPath: Path = path?.let { Path(it) } ?: Path("markdown/${folderId ?: project}")
Files.createDirectories(markdownPath)
@ -108,13 +108,13 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
)
if (html) {
val htmlTargetPath = path?.let { Path(it) }?.resolve(htmlPath ?: "html")
?: Path(htmlPath ?: "html/$folderId")
?: Path(htmlPath ?: "html/${folderId ?: project}")
htmlTargetPath.createDirectories()
convertToHtml(markdownPath, htmlTargetPath)
}
if (docx) {
val docxTargetPath = path?.let { Path(it) }?.resolve(docxPath ?: "docx")
?: Path(docxPath ?: "docx/$folderId")
?: Path(docxPath ?: "docx/${folderId ?: project}")
docxTargetPath.createDirectories()
convertToDocX(markdownPath, docxTargetPath)
}
@ -123,7 +123,7 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
companion object {
private val urlRegex =
"""(?<spaceUrl>https?:\/\/[^\/]*)\/p\/(?<projectName>[^\/]*)\/.*-(?<folderId>.*)${'$'}""".toRegex()
"""(?<spaceUrl>https?:\/\/[^\/]*)\/p\/(?<projectName>[^\/]*)(\/.*-(?<folderId>.*)${'$'})?""".toRegex()
}
}