Fix document export for non-markdown and inline image processing.
This commit is contained in:
parent
5f601cbd72
commit
46e3fb966c
@ -3,11 +3,9 @@ package center.sciprog.space.documentextractor
|
||||
import io.ktor.client.request.header
|
||||
import io.ktor.client.request.request
|
||||
import io.ktor.client.request.url
|
||||
import io.ktor.client.statement.bodyAsChannel
|
||||
import io.ktor.client.statement.readBytes
|
||||
import io.ktor.http.HttpHeaders
|
||||
import io.ktor.http.HttpMethod
|
||||
import io.ktor.utils.io.jvm.javaio.copyTo
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.coroutineScope
|
||||
import kotlinx.coroutines.launch
|
||||
@ -17,21 +15,21 @@ import space.jetbrains.api.runtime.Batch
|
||||
import space.jetbrains.api.runtime.SpaceClient
|
||||
import space.jetbrains.api.runtime.resources.projects
|
||||
import space.jetbrains.api.runtime.types.*
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.*
|
||||
import kotlin.streams.toList
|
||||
import kotlin.io.path.createDirectories
|
||||
import kotlin.io.path.writeBytes
|
||||
import kotlin.io.path.writeText
|
||||
|
||||
internal val logger by lazy { LoggerFactory.getLogger("space-extractor") }
|
||||
|
||||
/**
|
||||
* Extract single attachment image
|
||||
*/
|
||||
internal suspend fun SpaceClient.extractImage(
|
||||
internal suspend fun SpaceClient.extractAttachment(
|
||||
imageFile: Path,
|
||||
imageId: String,
|
||||
) {
|
||||
logger.info("Downloading image file to $imageFile")
|
||||
logger.info("Downloading attachment file to $imageFile")
|
||||
val response = ktorClient.request {
|
||||
url("${server.serverUrl}/d/$imageId")
|
||||
method = HttpMethod.Get
|
||||
@ -54,57 +52,10 @@ internal suspend fun SpaceClient.extractFile(
|
||||
method = HttpMethod.Get
|
||||
header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
|
||||
}
|
||||
documentFile.outputStream().use {
|
||||
response.bodyAsChannel().copyTo(it)
|
||||
}
|
||||
documentFile.writeBytes(response.readBytes())
|
||||
}
|
||||
|
||||
private val regex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0""".toRegex()
|
||||
|
||||
/**
|
||||
* Post-process a Markdown document by downloading images and replacing links
|
||||
*/
|
||||
internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope {
|
||||
val documentBody = path.readText()
|
||||
val logger = LoggerFactory.getLogger("space-document-extractor")
|
||||
logger.info("Processing file $path...")
|
||||
val imageDirectory = path.parent.resolve("images")
|
||||
imageDirectory.createDirectories()
|
||||
val newText = documentBody.replace(regex) {
|
||||
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
|
||||
val alt = it.groups["alt"]?.value
|
||||
val imageName = alt?.let { "$id-$alt" } ?: id
|
||||
val imageFile = imageDirectory.resolve(imageName)
|
||||
logger.info("Downloading image $id as $imageFile")
|
||||
launch(Dispatchers.IO) {
|
||||
extractImage(imageFile, id)
|
||||
}
|
||||
"![$alt](images/$imageName"
|
||||
}
|
||||
path.writeText(newText)
|
||||
}
|
||||
|
||||
/**
|
||||
* Download images for Markdown documents in the directory
|
||||
*
|
||||
* Images are always stored in the same directory as files themselves
|
||||
*
|
||||
* @param recursive turn recursive mode on or off
|
||||
*/
|
||||
internal suspend fun SpaceClient.processMarkdownInDirectory(
|
||||
path: Path,
|
||||
fileExtension: String = ".md",
|
||||
recursive: Boolean = true,
|
||||
) {
|
||||
Files.list(path).toList().forEach {
|
||||
if (it.toString().endsWith(fileExtension)) {
|
||||
logger.info("Updating links in a markdown $it")
|
||||
processMarkdownDocument(it)
|
||||
} else if (recursive && it.isDirectory()) {
|
||||
processMarkdownInDirectory(it, fileExtension)
|
||||
}
|
||||
}
|
||||
}
|
||||
private val imageRegex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0( "(?<name>.*)")?\)""".toRegex()
|
||||
|
||||
/**
|
||||
* Download single Space document
|
||||
@ -118,7 +69,7 @@ internal suspend fun SpaceClient.downloadDocument(
|
||||
launch(Dispatchers.IO) {
|
||||
val filePath = try {
|
||||
directory.resolve(document.title)
|
||||
} catch (ex: Exception){
|
||||
} catch (ex: Exception) {
|
||||
directory.resolve(document.id)
|
||||
}
|
||||
extractFile(filePath, document.id)
|
||||
@ -131,19 +82,30 @@ internal suspend fun SpaceClient.downloadDocument(
|
||||
} catch (ex: Exception) {
|
||||
directory.resolve(document.id + ".md")
|
||||
}
|
||||
val content = body.docContent
|
||||
if (content is MdTextDocumentContent) {
|
||||
markdownFilePath.writeText(content.markdown, Charsets.UTF_8)
|
||||
} else {
|
||||
launch(Dispatchers.IO) {
|
||||
val filePath = try {
|
||||
directory.resolve(document.title)
|
||||
} catch (ex: Exception){
|
||||
directory.resolve(document.id)
|
||||
when (val content = body.docContent) {
|
||||
is MdTextDocumentContent -> {
|
||||
val imageDirectory = directory.resolve("images")
|
||||
imageDirectory.createDirectories()
|
||||
val newText = content.markdown.replace(imageRegex) {
|
||||
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
|
||||
val alt = it.groups["alt"]?.value?.ifBlank { null }
|
||||
val name = it.groups["name"]?.value?.ifBlank { null }
|
||||
val imageName = name?.let { "$id-$name" } ?: alt?.let { "$id-$alt" } ?: id
|
||||
val imageFile = imageDirectory.resolve(imageName)
|
||||
logger.info("Downloading image $id as $imageFile")
|
||||
launch(Dispatchers.IO) {
|
||||
extractAttachment(imageFile, id)
|
||||
}
|
||||
"![${alt?:""}](images/$imageName)"
|
||||
}
|
||||
extractFile(filePath, document.id)
|
||||
markdownFilePath.writeText(newText, Charsets.UTF_8)
|
||||
}
|
||||
|
||||
else -> {
|
||||
logger.error("Rich text documents are unsupported (${document.title})")
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
else -> {
|
||||
@ -168,12 +130,29 @@ internal suspend fun SpaceClient.downloadDocumentFolder(
|
||||
}
|
||||
documents.data.forEach {
|
||||
val document = projects.documents.getDocument(projectId, it.id) {
|
||||
id()
|
||||
title()
|
||||
body()
|
||||
bodyType()
|
||||
title()
|
||||
id()
|
||||
}
|
||||
val bodyInfo = document.body
|
||||
if ((bodyInfo is TextDocumentHttpBody) && bodyInfo.docContent !is MdTextDocumentContent) {
|
||||
//make a conversion to markdown before downloading
|
||||
logger.info("Converting document ${document.title} in ${folderId.compactId} to markdown format")
|
||||
val convertedDocument = projects.documents.updateDocument(
|
||||
project = projectId,
|
||||
documentId = it.id,
|
||||
updateIn = TextDocumentBodyConvertTypeIn(
|
||||
type = DraftDocumentType.MARKDOWN
|
||||
)
|
||||
) {
|
||||
body()
|
||||
title()
|
||||
id()
|
||||
}
|
||||
downloadDocument(directory, convertedDocument)
|
||||
} else {
|
||||
downloadDocument(directory, document)
|
||||
}
|
||||
downloadDocument(directory, document)
|
||||
}
|
||||
|
||||
val subFolders: Batch<DocumentFolder> = projects.documents.folders.subfolders.listSubfolders(projectId, folderId)
|
||||
@ -195,5 +174,5 @@ suspend fun SpaceClient.downloadAndProcessDocumentsInProject(
|
||||
) = withContext(Dispatchers.IO) {
|
||||
logger.info("Processing project ${projectId.compactId} to $directory")
|
||||
downloadDocumentFolder(directory, projectId, rootFolder)
|
||||
processMarkdownInDirectory(directory)
|
||||
// processMarkdownInDirectory(directory)
|
||||
}
|
@ -57,7 +57,7 @@ private suspend fun SpaceClient.writeMessages(
|
||||
val fileId = attachment.id
|
||||
val name = "${attachment.id}-${attachment.filename}"
|
||||
val file = attachmentsDirectory.resolve(name)
|
||||
extractFile(file, fileId)
|
||||
extractAttachment(file, fileId)
|
||||
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
|
||||
}
|
||||
|
||||
@ -65,7 +65,7 @@ private suspend fun SpaceClient.writeMessages(
|
||||
val fileId = attachment.id
|
||||
val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId
|
||||
val file = attachmentsDirectory.resolve(name)
|
||||
extractFile(file, fileId)
|
||||
extractAttachment(file, fileId)
|
||||
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
|
||||
}
|
||||
|
||||
@ -73,7 +73,7 @@ private suspend fun SpaceClient.writeMessages(
|
||||
val fileId = attachment.id
|
||||
val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId
|
||||
val file = attachmentsDirectory.resolve(name)
|
||||
extractFile(file, fileId)
|
||||
extractAttachment(file, fileId)
|
||||
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
|
||||
}
|
||||
}
|
||||
@ -104,6 +104,7 @@ suspend fun SpaceClient.extractMessages(
|
||||
name()
|
||||
username()
|
||||
}
|
||||
name()
|
||||
}
|
||||
contact{
|
||||
key()
|
||||
|
@ -83,7 +83,7 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
|
||||
|
||||
val folderId: String? = urlMatch.groups["folderId"]?.value
|
||||
|
||||
val markdownPath: Path = path?.let { Path(it) } ?: Path("markdown/$folderId")
|
||||
val markdownPath: Path = path?.let { Path(it) } ?: Path("markdown/${folderId ?: project}")
|
||||
|
||||
Files.createDirectories(markdownPath)
|
||||
|
||||
@ -108,13 +108,13 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
|
||||
)
|
||||
if (html) {
|
||||
val htmlTargetPath = path?.let { Path(it) }?.resolve(htmlPath ?: "html")
|
||||
?: Path(htmlPath ?: "html/$folderId")
|
||||
?: Path(htmlPath ?: "html/${folderId ?: project}")
|
||||
htmlTargetPath.createDirectories()
|
||||
convertToHtml(markdownPath, htmlTargetPath)
|
||||
}
|
||||
if (docx) {
|
||||
val docxTargetPath = path?.let { Path(it) }?.resolve(docxPath ?: "docx")
|
||||
?: Path(docxPath ?: "docx/$folderId")
|
||||
?: Path(docxPath ?: "docx/${folderId ?: project}")
|
||||
docxTargetPath.createDirectories()
|
||||
convertToDocX(markdownPath, docxTargetPath)
|
||||
}
|
||||
@ -123,7 +123,7 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
|
||||
|
||||
companion object {
|
||||
private val urlRegex =
|
||||
"""(?<spaceUrl>https?:\/\/[^\/]*)\/p\/(?<projectName>[^\/]*)\/.*-(?<folderId>.*)${'$'}""".toRegex()
|
||||
"""(?<spaceUrl>https?:\/\/[^\/]*)\/p\/(?<projectName>[^\/]*)(\/.*-(?<folderId>.*)${'$'})?""".toRegex()
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user