Fix document export for non-markdown and inline image processing.
This commit is contained in:
parent
5f601cbd72
commit
46e3fb966c
@ -3,11 +3,9 @@ package center.sciprog.space.documentextractor
|
|||||||
import io.ktor.client.request.header
|
import io.ktor.client.request.header
|
||||||
import io.ktor.client.request.request
|
import io.ktor.client.request.request
|
||||||
import io.ktor.client.request.url
|
import io.ktor.client.request.url
|
||||||
import io.ktor.client.statement.bodyAsChannel
|
|
||||||
import io.ktor.client.statement.readBytes
|
import io.ktor.client.statement.readBytes
|
||||||
import io.ktor.http.HttpHeaders
|
import io.ktor.http.HttpHeaders
|
||||||
import io.ktor.http.HttpMethod
|
import io.ktor.http.HttpMethod
|
||||||
import io.ktor.utils.io.jvm.javaio.copyTo
|
|
||||||
import kotlinx.coroutines.Dispatchers
|
import kotlinx.coroutines.Dispatchers
|
||||||
import kotlinx.coroutines.coroutineScope
|
import kotlinx.coroutines.coroutineScope
|
||||||
import kotlinx.coroutines.launch
|
import kotlinx.coroutines.launch
|
||||||
@ -17,21 +15,21 @@ import space.jetbrains.api.runtime.Batch
|
|||||||
import space.jetbrains.api.runtime.SpaceClient
|
import space.jetbrains.api.runtime.SpaceClient
|
||||||
import space.jetbrains.api.runtime.resources.projects
|
import space.jetbrains.api.runtime.resources.projects
|
||||||
import space.jetbrains.api.runtime.types.*
|
import space.jetbrains.api.runtime.types.*
|
||||||
import java.nio.file.Files
|
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
import kotlin.io.path.*
|
import kotlin.io.path.createDirectories
|
||||||
import kotlin.streams.toList
|
import kotlin.io.path.writeBytes
|
||||||
|
import kotlin.io.path.writeText
|
||||||
|
|
||||||
internal val logger by lazy { LoggerFactory.getLogger("space-extractor") }
|
internal val logger by lazy { LoggerFactory.getLogger("space-extractor") }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract single attachment image
|
* Extract single attachment image
|
||||||
*/
|
*/
|
||||||
internal suspend fun SpaceClient.extractImage(
|
internal suspend fun SpaceClient.extractAttachment(
|
||||||
imageFile: Path,
|
imageFile: Path,
|
||||||
imageId: String,
|
imageId: String,
|
||||||
) {
|
) {
|
||||||
logger.info("Downloading image file to $imageFile")
|
logger.info("Downloading attachment file to $imageFile")
|
||||||
val response = ktorClient.request {
|
val response = ktorClient.request {
|
||||||
url("${server.serverUrl}/d/$imageId")
|
url("${server.serverUrl}/d/$imageId")
|
||||||
method = HttpMethod.Get
|
method = HttpMethod.Get
|
||||||
@ -54,57 +52,10 @@ internal suspend fun SpaceClient.extractFile(
|
|||||||
method = HttpMethod.Get
|
method = HttpMethod.Get
|
||||||
header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
|
header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
|
||||||
}
|
}
|
||||||
documentFile.outputStream().use {
|
documentFile.writeBytes(response.readBytes())
|
||||||
response.bodyAsChannel().copyTo(it)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private val regex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0""".toRegex()
|
private val imageRegex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0( "(?<name>.*)")?\)""".toRegex()
|
||||||
|
|
||||||
/**
|
|
||||||
* Post-process a Markdown document by downloading images and replacing links
|
|
||||||
*/
|
|
||||||
internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope {
|
|
||||||
val documentBody = path.readText()
|
|
||||||
val logger = LoggerFactory.getLogger("space-document-extractor")
|
|
||||||
logger.info("Processing file $path...")
|
|
||||||
val imageDirectory = path.parent.resolve("images")
|
|
||||||
imageDirectory.createDirectories()
|
|
||||||
val newText = documentBody.replace(regex) {
|
|
||||||
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
|
|
||||||
val alt = it.groups["alt"]?.value
|
|
||||||
val imageName = alt?.let { "$id-$alt" } ?: id
|
|
||||||
val imageFile = imageDirectory.resolve(imageName)
|
|
||||||
logger.info("Downloading image $id as $imageFile")
|
|
||||||
launch(Dispatchers.IO) {
|
|
||||||
extractImage(imageFile, id)
|
|
||||||
}
|
|
||||||
"![$alt](images/$imageName"
|
|
||||||
}
|
|
||||||
path.writeText(newText)
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Download images for Markdown documents in the directory
|
|
||||||
*
|
|
||||||
* Images are always stored in the same directory as files themselves
|
|
||||||
*
|
|
||||||
* @param recursive turn recursive mode on or off
|
|
||||||
*/
|
|
||||||
internal suspend fun SpaceClient.processMarkdownInDirectory(
|
|
||||||
path: Path,
|
|
||||||
fileExtension: String = ".md",
|
|
||||||
recursive: Boolean = true,
|
|
||||||
) {
|
|
||||||
Files.list(path).toList().forEach {
|
|
||||||
if (it.toString().endsWith(fileExtension)) {
|
|
||||||
logger.info("Updating links in a markdown $it")
|
|
||||||
processMarkdownDocument(it)
|
|
||||||
} else if (recursive && it.isDirectory()) {
|
|
||||||
processMarkdownInDirectory(it, fileExtension)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Download single Space document
|
* Download single Space document
|
||||||
@ -131,19 +82,30 @@ internal suspend fun SpaceClient.downloadDocument(
|
|||||||
} catch (ex: Exception) {
|
} catch (ex: Exception) {
|
||||||
directory.resolve(document.id + ".md")
|
directory.resolve(document.id + ".md")
|
||||||
}
|
}
|
||||||
val content = body.docContent
|
when (val content = body.docContent) {
|
||||||
if (content is MdTextDocumentContent) {
|
is MdTextDocumentContent -> {
|
||||||
markdownFilePath.writeText(content.markdown, Charsets.UTF_8)
|
val imageDirectory = directory.resolve("images")
|
||||||
} else {
|
imageDirectory.createDirectories()
|
||||||
|
val newText = content.markdown.replace(imageRegex) {
|
||||||
|
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
|
||||||
|
val alt = it.groups["alt"]?.value?.ifBlank { null }
|
||||||
|
val name = it.groups["name"]?.value?.ifBlank { null }
|
||||||
|
val imageName = name?.let { "$id-$name" } ?: alt?.let { "$id-$alt" } ?: id
|
||||||
|
val imageFile = imageDirectory.resolve(imageName)
|
||||||
|
logger.info("Downloading image $id as $imageFile")
|
||||||
launch(Dispatchers.IO) {
|
launch(Dispatchers.IO) {
|
||||||
val filePath = try {
|
extractAttachment(imageFile, id)
|
||||||
directory.resolve(document.title)
|
|
||||||
} catch (ex: Exception){
|
|
||||||
directory.resolve(document.id)
|
|
||||||
}
|
}
|
||||||
extractFile(filePath, document.id)
|
"![${alt?:""}](images/$imageName)"
|
||||||
|
}
|
||||||
|
markdownFilePath.writeText(newText, Charsets.UTF_8)
|
||||||
|
}
|
||||||
|
|
||||||
|
else -> {
|
||||||
|
logger.error("Rich text documents are unsupported (${document.title})")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
else -> {
|
else -> {
|
||||||
@ -168,13 +130,30 @@ internal suspend fun SpaceClient.downloadDocumentFolder(
|
|||||||
}
|
}
|
||||||
documents.data.forEach {
|
documents.data.forEach {
|
||||||
val document = projects.documents.getDocument(projectId, it.id) {
|
val document = projects.documents.getDocument(projectId, it.id) {
|
||||||
id()
|
|
||||||
title()
|
|
||||||
body()
|
body()
|
||||||
bodyType()
|
title()
|
||||||
|
id()
|
||||||
}
|
}
|
||||||
|
val bodyInfo = document.body
|
||||||
|
if ((bodyInfo is TextDocumentHttpBody) && bodyInfo.docContent !is MdTextDocumentContent) {
|
||||||
|
//make a conversion to markdown before downloading
|
||||||
|
logger.info("Converting document ${document.title} in ${folderId.compactId} to markdown format")
|
||||||
|
val convertedDocument = projects.documents.updateDocument(
|
||||||
|
project = projectId,
|
||||||
|
documentId = it.id,
|
||||||
|
updateIn = TextDocumentBodyConvertTypeIn(
|
||||||
|
type = DraftDocumentType.MARKDOWN
|
||||||
|
)
|
||||||
|
) {
|
||||||
|
body()
|
||||||
|
title()
|
||||||
|
id()
|
||||||
|
}
|
||||||
|
downloadDocument(directory, convertedDocument)
|
||||||
|
} else {
|
||||||
downloadDocument(directory, document)
|
downloadDocument(directory, document)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
val subFolders: Batch<DocumentFolder> = projects.documents.folders.subfolders.listSubfolders(projectId, folderId)
|
val subFolders: Batch<DocumentFolder> = projects.documents.folders.subfolders.listSubfolders(projectId, folderId)
|
||||||
subFolders.data.forEach {
|
subFolders.data.forEach {
|
||||||
@ -195,5 +174,5 @@ suspend fun SpaceClient.downloadAndProcessDocumentsInProject(
|
|||||||
) = withContext(Dispatchers.IO) {
|
) = withContext(Dispatchers.IO) {
|
||||||
logger.info("Processing project ${projectId.compactId} to $directory")
|
logger.info("Processing project ${projectId.compactId} to $directory")
|
||||||
downloadDocumentFolder(directory, projectId, rootFolder)
|
downloadDocumentFolder(directory, projectId, rootFolder)
|
||||||
processMarkdownInDirectory(directory)
|
// processMarkdownInDirectory(directory)
|
||||||
}
|
}
|
@ -57,7 +57,7 @@ private suspend fun SpaceClient.writeMessages(
|
|||||||
val fileId = attachment.id
|
val fileId = attachment.id
|
||||||
val name = "${attachment.id}-${attachment.filename}"
|
val name = "${attachment.id}-${attachment.filename}"
|
||||||
val file = attachmentsDirectory.resolve(name)
|
val file = attachmentsDirectory.resolve(name)
|
||||||
extractFile(file, fileId)
|
extractAttachment(file, fileId)
|
||||||
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
|
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -65,7 +65,7 @@ private suspend fun SpaceClient.writeMessages(
|
|||||||
val fileId = attachment.id
|
val fileId = attachment.id
|
||||||
val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId
|
val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId
|
||||||
val file = attachmentsDirectory.resolve(name)
|
val file = attachmentsDirectory.resolve(name)
|
||||||
extractFile(file, fileId)
|
extractAttachment(file, fileId)
|
||||||
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
|
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -73,7 +73,7 @@ private suspend fun SpaceClient.writeMessages(
|
|||||||
val fileId = attachment.id
|
val fileId = attachment.id
|
||||||
val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId
|
val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId
|
||||||
val file = attachmentsDirectory.resolve(name)
|
val file = attachmentsDirectory.resolve(name)
|
||||||
extractFile(file, fileId)
|
extractAttachment(file, fileId)
|
||||||
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
|
writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -104,6 +104,7 @@ suspend fun SpaceClient.extractMessages(
|
|||||||
name()
|
name()
|
||||||
username()
|
username()
|
||||||
}
|
}
|
||||||
|
name()
|
||||||
}
|
}
|
||||||
contact{
|
contact{
|
||||||
key()
|
key()
|
||||||
|
@ -83,7 +83,7 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
|
|||||||
|
|
||||||
val folderId: String? = urlMatch.groups["folderId"]?.value
|
val folderId: String? = urlMatch.groups["folderId"]?.value
|
||||||
|
|
||||||
val markdownPath: Path = path?.let { Path(it) } ?: Path("markdown/$folderId")
|
val markdownPath: Path = path?.let { Path(it) } ?: Path("markdown/${folderId ?: project}")
|
||||||
|
|
||||||
Files.createDirectories(markdownPath)
|
Files.createDirectories(markdownPath)
|
||||||
|
|
||||||
@ -108,13 +108,13 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
|
|||||||
)
|
)
|
||||||
if (html) {
|
if (html) {
|
||||||
val htmlTargetPath = path?.let { Path(it) }?.resolve(htmlPath ?: "html")
|
val htmlTargetPath = path?.let { Path(it) }?.resolve(htmlPath ?: "html")
|
||||||
?: Path(htmlPath ?: "html/$folderId")
|
?: Path(htmlPath ?: "html/${folderId ?: project}")
|
||||||
htmlTargetPath.createDirectories()
|
htmlTargetPath.createDirectories()
|
||||||
convertToHtml(markdownPath, htmlTargetPath)
|
convertToHtml(markdownPath, htmlTargetPath)
|
||||||
}
|
}
|
||||||
if (docx) {
|
if (docx) {
|
||||||
val docxTargetPath = path?.let { Path(it) }?.resolve(docxPath ?: "docx")
|
val docxTargetPath = path?.let { Path(it) }?.resolve(docxPath ?: "docx")
|
||||||
?: Path(docxPath ?: "docx/$folderId")
|
?: Path(docxPath ?: "docx/${folderId ?: project}")
|
||||||
docxTargetPath.createDirectories()
|
docxTargetPath.createDirectories()
|
||||||
convertToDocX(markdownPath, docxTargetPath)
|
convertToDocX(markdownPath, docxTargetPath)
|
||||||
}
|
}
|
||||||
@ -123,7 +123,7 @@ private class ExtractDocumentsCommand : ExtractCommand("docs", "Extract document
|
|||||||
|
|
||||||
companion object {
|
companion object {
|
||||||
private val urlRegex =
|
private val urlRegex =
|
||||||
"""(?<spaceUrl>https?:\/\/[^\/]*)\/p\/(?<projectName>[^\/]*)\/.*-(?<folderId>.*)${'$'}""".toRegex()
|
"""(?<spaceUrl>https?:\/\/[^\/]*)\/p\/(?<projectName>[^\/]*)(\/.*-(?<folderId>.*)${'$'})?""".toRegex()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user