Full extraction

This commit is contained in:
Alexander Nozik 2022-04-26 21:20:46 +03:00
parent 5cd96ae106
commit 39cdff3726
No known key found for this signature in database
GPG Key ID: F7FCF2DD25C71357
5 changed files with 148 additions and 49 deletions

3
.gitignore vendored
View File

@ -1,3 +1,4 @@
.idea/ .idea/
.gradle/ .gradle/
build/ build/
output/

View File

@ -20,7 +20,7 @@ dependencies {
implementation("io.ktor:ktor-client-cio:$ktorVersion") implementation("io.ktor:ktor-client-cio:$ktorVersion")
implementation("io.ktor:ktor-client-auth:$ktorVersion") implementation("io.ktor:ktor-client-auth:$ktorVersion")
implementation("org.jetbrains.kotlinx:kotlinx-cli:0.3.4") implementation("org.jetbrains.kotlinx:kotlinx-cli:0.3.4")
implementation("org.jetbrains:space-sdk-jvm:86641-beta") implementation("org.jetbrains:space-sdk-jvm:98244-beta")
implementation("ch.qos.logback:logback-classic:1.2.10") implementation("ch.qos.logback:logback-classic:1.2.10")
testImplementation(kotlin("test")) testImplementation(kotlin("test"))
} }

View File

@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.2-bin.zip distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
zipStoreBase=GRADLE_USER_HOME zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists zipStorePath=wrapper/dists

View File

@ -1,26 +1,40 @@
package ru.mipt.npm.space.documentextractor package ru.mipt.npm.space.documentextractor
import io.ktor.client.HttpClient
import io.ktor.client.engine.cio.CIO import io.ktor.client.engine.cio.CIO
import kotlinx.cli.ArgParser import kotlinx.cli.ArgParser
import kotlinx.cli.ArgType import kotlinx.cli.ArgType
import kotlinx.cli.required import kotlinx.cli.required
import space.jetbrains.api.runtime.SpaceHttpClient import kotlinx.coroutines.coroutineScope
import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext import space.jetbrains.api.runtime.SpaceAppInstance
import space.jetbrains.api.runtime.withServiceAccountTokenSource import space.jetbrains.api.runtime.SpaceAuth
import space.jetbrains.api.runtime.SpaceClient
import space.jetbrains.api.runtime.ktorClientForSpace
import space.jetbrains.api.runtime.resources.projects
import space.jetbrains.api.runtime.types.FolderIdentifier
import space.jetbrains.api.runtime.types.ProjectIdentifier
import java.nio.file.Files
import java.nio.file.Path import java.nio.file.Path
import kotlin.io.path.exists
import kotlin.io.path.isDirectory
suspend fun main(args: Array<String>) { suspend fun main(args: Array<String>) {
val parser = ArgParser("space-document-extractor") val parser = ArgParser("space-document-extractor")
val path by parser.option(ArgType.String, description = "Input file or directory").required()
val spaceUrl by parser.option( val spaceUrl by parser.option(
ArgType.String, ArgType.String,
description = "Url of the space instance like 'https://mipt-npm.jetbrains.space'" description = "Url of the space instance like 'https://mipt-npm.jetbrains.space'"
).required() ).required()
val project by parser.option(
ArgType.String,
description = "The key of the exported project"
).required()
val path: String? by parser.option(ArgType.String, description = "Target directory. Default is current directory")
val folderId: String? by parser.option(
ArgType.String,
description = "FolderId for the folder to export"
)
val clientId by parser.option( val clientId by parser.option(
ArgType.String, ArgType.String,
description = "Space application client ID (if not defined, use environment value 'space.clientId')" description = "Space application client ID (if not defined, use environment value 'space.clientId')"
@ -31,26 +45,29 @@ suspend fun main(args: Array<String>) {
description = "Space application client secret (if not defined, use environment value 'space.clientSecret')" description = "Space application client secret (if not defined, use environment value 'space.clientSecret')"
) )
parser.parse(args) parser.parse(args)
val pathValue: Path = Path.of(path) val target: Path = path?.let { Path.of(path) } ?: Path.of("output/$project")
if (!pathValue.exists()) { Files.createDirectories(target)
error("File or directory not found at $path")
}
val client = HttpClient(CIO)
val space: SpaceHttpClientWithCallContext = SpaceHttpClient(client).withServiceAccountTokenSource( val space: SpaceClient = SpaceClient(
clientId = clientId ?: System.getProperty("space.clientId"), ktorClientForSpace(CIO),
clientSecret = clientSecret ?: System.getProperty("space.clientSecret"), SpaceAppInstance(
serverUrl = "https://mipt-npm.jetbrains.space" clientId ?: System.getProperty("space.clientId"),
clientSecret ?: System.getProperty("space.clientSecret"),
spaceUrl
),
SpaceAuth.ClientCredentials()
) )
if (pathValue.isDirectory()) { coroutineScope {
space.processDirectory(client, spaceUrl, pathValue) println("Processing project \"${space.projects.getProject(ProjectIdentifier.Key(project)).name}\"")
} else { space.downloadAndProcessDocumentsInProject(
space.processDocument(client, spaceUrl, pathValue) target,
ProjectIdentifier.Key(project),
folderId?.let { FolderIdentifier.Id(it) } ?: FolderIdentifier.Root
)
} }
} }

View File

@ -1,7 +1,5 @@
package ru.mipt.npm.space.documentextractor package ru.mipt.npm.space.documentextractor
import io.ktor.client.HttpClient
import io.ktor.client.request.HttpRequestBuilder
import io.ktor.client.request.header import io.ktor.client.request.header
import io.ktor.client.request.request import io.ktor.client.request.request
import io.ktor.client.request.url import io.ktor.client.request.url
@ -9,38 +7,59 @@ import io.ktor.client.statement.HttpResponse
import io.ktor.client.statement.readBytes import io.ktor.client.statement.readBytes
import io.ktor.http.HttpHeaders import io.ktor.http.HttpHeaders
import io.ktor.http.HttpMethod import io.ktor.http.HttpMethod
import kotlinx.coroutines.runBlocking import io.ktor.utils.io.jvm.javaio.copyTo
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.launch
import org.slf4j.LoggerFactory import org.slf4j.LoggerFactory
import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext import space.jetbrains.api.runtime.Batch
import space.jetbrains.api.runtime.SpaceClient
import space.jetbrains.api.runtime.resources.projects
import space.jetbrains.api.runtime.types.*
import java.nio.file.Files import java.nio.file.Files
import java.nio.file.Path import java.nio.file.Path
import kotlin.io.path.isDirectory import kotlin.io.path.*
import kotlin.io.path.readText
import kotlin.io.path.writeBytes
import kotlin.io.path.writeText
import kotlin.streams.toList import kotlin.streams.toList
fun SpaceHttpClientWithCallContext.extractImage( private val logger by lazy { LoggerFactory.getLogger("space-extractor") }
client: HttpClient,
spaceUrl: String, internal suspend fun SpaceClient.extractImage(
parent: Path, parent: Path,
imageId: String, imageId: String,
imageFileName: String, imageFileName: String,
) = runBlocking { ) {
val request = HttpRequestBuilder().apply { logger.info("Downloading image file $imageFileName to $parent")
val token = callContext.tokenSource.token() val response = ktorClient.request<HttpResponse> {
url("$spaceUrl/d/$imageId") url("${server.serverUrl}/d/$imageId")
method = HttpMethod.Get method = HttpMethod.Get
header(HttpHeaders.Authorization, "Bearer ${token.accessToken}") header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
} }
val response = client.request<HttpResponse>(request)
val file = parent.resolve("images/$imageFileName") val file = parent.resolve("images/$imageFileName")
file.parent.createDirectories()
file.writeBytes(response.readBytes()) file.writeBytes(response.readBytes())
} }
/**
 * Downloads a file-backed document and stores it as [documentFileName] inside [parent].
 *
 * The file is requested from `<serverUrl>/drive/files/<documentId>` using the client's bearer token,
 * and the response body is streamed into the target file.
 *
 * @param parent directory that receives the downloaded file
 * @param documentId identifier of the document, used to build the download URL
 * @param documentFileName name of the target file, resolved against [parent]
 */
internal suspend fun SpaceClient.extractDocument(
    parent: Path,
    documentId: String,
    documentFileName: String,
) {
    // Example download URL: https://mipt-npm.jetbrains.space/drive/files/3qe9i43qtPq2
    logger.info("Downloading document file $documentFileName to $parent")
    val response = ktorClient.request<HttpResponse> {
        url("${server.serverUrl}/drive/files/$documentId")
        method = HttpMethod.Get
        header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
    }
    val file = parent.resolve(documentFileName)
    // Mirror extractImage: make sure the directory that will hold the file exists, so opening
    // the output stream does not fail when the function is called on a not-yet-created directory.
    // `parent` of the resolved path is null only for a bare relative file name; nothing to create then.
    file.parent?.createDirectories()
    file.outputStream().use { output ->
        response.content.copyTo(output)
    }
}
private val regex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0""".toRegex() private val regex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0""".toRegex()
fun SpaceHttpClientWithCallContext.processDocument(client: HttpClient, spaceUrl: String, path: Path) { internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope{
val documentBody = path.readText() val documentBody = path.readText()
val logger = LoggerFactory.getLogger("space-document-extractor") val logger = LoggerFactory.getLogger("space-document-extractor")
logger.info("Processing file $path...") logger.info("Processing file $path...")
@ -48,25 +67,87 @@ fun SpaceHttpClientWithCallContext.processDocument(client: HttpClient, spaceUrl:
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}") val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
val alt = it.groups["alt"]?.value val alt = it.groups["alt"]?.value
logger.info("Downloading image $id as images/$id") logger.info("Downloading image $id as images/$id")
extractImage(client, spaceUrl, path.parent, id, id) launch(Dispatchers.IO) {
extractImage(path.parent, id, id)
}
"![$alt](images/$id" "![$alt](images/$id"
} }
path.writeText(newText) path.writeText(newText)
} }
fun SpaceHttpClientWithCallContext.processDirectory( /**
client: HttpClient, * Download images for markdown documents in the directory
spaceUrl: String, */
internal suspend fun SpaceClient.processMarkdownInDirectory(
path: Path, path: Path,
fileExtension: String = ".md", fileExtension: String = ".md",
recursive: Boolean = true, recursive: Boolean = true,
) { ) {
Files.list(path).toList().forEach { Files.list(path).toList().forEach {
if (it.toString().endsWith(fileExtension)) { if (it.toString().endsWith(fileExtension)) {
processDocument(client, spaceUrl, it) logger.info("Updating links in a markdown $it")
processMarkdownDocument(it)
} else if (recursive && it.isDirectory()) { } else if (recursive && it.isDirectory()) {
processDirectory(client, spaceUrl, it, fileExtension) processMarkdownInDirectory(it, fileExtension)
} }
} }
}
/**
 * Characters that are replaced in document titles before the title is used as a file name:
 * path separators, characters rejected by common file systems, and control characters.
 */
private val unsafeFileNameChars = """[\\/:*?"<>|\p{Cntrl}]""".toRegex()

/**
 * Stores a single [document] inside [directory].
 *
 * - A [FileDocumentBody] is downloaded through [extractDocument] on [Dispatchers.IO].
 * - A [TextDocument] is written as `<title>.md` in UTF-8.
 * - Any other body type is skipped with a warning.
 *
 * The document title is used as the file name after replacing characters that cannot appear
 * in a file name (for example `/`), so that such a title does not resolve to a missing
 * subdirectory or an invalid path.
 *
 * @param directory existing directory that receives the file
 * @param document document with `id`, `title`, `documentBody` and `bodyType` loaded
 */
internal suspend fun SpaceClient.downloadDocument(
    directory: Path,
    document: Document,
) = coroutineScope {
    val fileName = document.title.replace(unsafeFileNameChars, "_")
    when (val body = document.documentBody) {
        is FileDocumentBody -> {
            // coroutineScope does not return until this child coroutine has completed.
            launch(Dispatchers.IO) {
                extractDocument(directory, document.id, fileName)
            }
        }
        is TextDocument -> {
            val markdownFilePath = directory.resolve("$fileName.md")
            markdownFilePath.writeText(body.text, Charsets.UTF_8)
        }
        else -> {
            // Use the shared file-level logger, like the other functions in this file.
            logger.warn("Can't extract document ${document.title} with type ${document.bodyType}")
        }
    }
}
/**
 * Copies the contents of one document folder into [directory], descending into subfolders.
 *
 * The target directory is created first. Every document listed in the folder is then fetched
 * with its id, title, body and body type and handed to [downloadDocument]. Finally each
 * subfolder is processed recursively into a child directory named after the subfolder.
 *
 * NOTE(review): only the `data` of the batches returned by the listing calls is read here.
 * If the API pages its results, entries beyond the first batch would not be fetched — confirm.
 *
 * @param directory local directory that mirrors the folder
 * @param projectId project that owns the folder
 * @param folderId folder to download
 */
internal suspend fun SpaceClient.downloadDocumentFolder(
    directory: Path,
    projectId: ProjectIdentifier,
    folderId: FolderIdentifier,
) {
    directory.createDirectories()
    logger.info("Processing folder ${folderId.compactId} to $directory")

    // The listing only requests ids; the full document is loaded one by one below.
    val listing = projects.documents.folders.documents.listDocumentsInFolder(projectId, folderId) {
        id()
    }
    for (entry in listing.data) {
        val fullDocument = projects.documents.getDocument(projectId, entry.id) {
            id()
            title()
            documentBody()
            bodyType()
        }
        downloadDocument(directory, fullDocument)
    }

    val children: Batch<DocumentFolder> =
        projects.documents.folders.subfolders.listSubfolders(projectId, folderId)
    for (child in children.data) {
        downloadDocumentFolder(directory.resolve(child.name), projectId, FolderIdentifier.Id(child.id))
    }
}
/**
 * Downloads the documents of a project folder into [directory] and then post-processes
 * the markdown files found there.
 *
 * The work is done in two sequential steps: [downloadDocumentFolder] is called for
 * [rootFolder], and afterwards [processMarkdownInDirectory] is run on [directory].
 *
 * @param directory target directory for the downloaded documents
 * @param projectId project whose documents are downloaded
 * @param rootFolder folder to start from; defaults to [FolderIdentifier.Root]
 */
suspend fun SpaceClient.downloadAndProcessDocumentsInProject(
directory: Path,
projectId: ProjectIdentifier,
rootFolder: FolderIdentifier = FolderIdentifier.Root,
) {
logger.info("Processing project ${projectId.compactId} to $directory")
downloadDocumentFolder(directory, projectId, rootFolder)
processMarkdownInDirectory(directory)
} }