diff --git a/.gitignore b/.gitignore
index cd3d2f4..919304e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .idea/
 .gradle/
-build/
\ No newline at end of file
+build/
+output/
\ No newline at end of file
diff --git a/build.gradle.kts b/build.gradle.kts
index 00a1f37..0073ce9 100644
--- a/build.gradle.kts
+++ b/build.gradle.kts
@@ -20,7 +20,7 @@ dependencies {
     implementation("io.ktor:ktor-client-cio:$ktorVersion")
     implementation("io.ktor:ktor-client-auth:$ktorVersion")
     implementation("org.jetbrains.kotlinx:kotlinx-cli:0.3.4")
-    implementation("org.jetbrains:space-sdk-jvm:86641-beta")
+    implementation("org.jetbrains:space-sdk-jvm:98244-beta")
     implementation("ch.qos.logback:logback-classic:1.2.10")
     testImplementation(kotlin("test"))
 }
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index d2880ba..aa991fc 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.2-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
diff --git a/src/main/kotlin/main.kt b/src/main/kotlin/main.kt
index 81055b7..96c2dfc 100644
--- a/src/main/kotlin/main.kt
+++ b/src/main/kotlin/main.kt
@@ -1,26 +1,40 @@
 package ru.mipt.npm.space.documentextractor
 
-import io.ktor.client.HttpClient
 import io.ktor.client.engine.cio.CIO
 import kotlinx.cli.ArgParser
 import kotlinx.cli.ArgType
 import kotlinx.cli.required
-import space.jetbrains.api.runtime.SpaceHttpClient
-import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext
-import space.jetbrains.api.runtime.withServiceAccountTokenSource
+import kotlinx.coroutines.coroutineScope
+import space.jetbrains.api.runtime.SpaceAppInstance
+import space.jetbrains.api.runtime.SpaceAuth
+import space.jetbrains.api.runtime.SpaceClient
+import space.jetbrains.api.runtime.ktorClientForSpace
+import space.jetbrains.api.runtime.resources.projects
+import space.jetbrains.api.runtime.types.FolderIdentifier
+import space.jetbrains.api.runtime.types.ProjectIdentifier
+import java.nio.file.Files
 import java.nio.file.Path
-import kotlin.io.path.exists
-import kotlin.io.path.isDirectory
 
 suspend fun main(args: Array<String>) {
     val parser = ArgParser("space-document-extractor")
-    val path by parser.option(ArgType.String, description = "Input file or directory").required()
 
     val spaceUrl by parser.option(
         ArgType.String,
         description = "Url of the space instance like 'https://mipt-npm.jetbrains.space'"
     ).required()
 
+    val project by parser.option(
+        ArgType.String,
+        description = "The key of the exported project"
+    ).required()
+
+    val path: String? by parser.option(ArgType.String, description = "Target directory. Default is current directory")
+
+    val folderId: String? by parser.option(
+        ArgType.String,
+        description = "FolderId for the folder to export"
+    )
+
     val clientId by parser.option(
         ArgType.String,
         description = "Space application client ID (if not defined, use environment value 'space.clientId')"
@@ -31,26 +45,29 @@ suspend fun main(args: Array<String>) {
         description = "Space application client secret (if not defined, use environment value 'space.clientSecret')"
     )
 
-
     parser.parse(args)
 
-    val pathValue: Path = Path.of(path)
+    val target: Path = path?.let { Path.of(it) } ?: Path.of("output/$project")
 
-    if (!pathValue.exists()) {
-        error("File or directory not found at $path")
-    }
+    Files.createDirectories(target)
 
-    val client = HttpClient(CIO)
-    val space: SpaceHttpClientWithCallContext = SpaceHttpClient(client).withServiceAccountTokenSource(
-        clientId = clientId ?: System.getProperty("space.clientId"),
-        clientSecret = clientSecret ?: System.getProperty("space.clientSecret"),
-        serverUrl = "https://mipt-npm.jetbrains.space"
+
+    val space: SpaceClient = SpaceClient(
+        ktorClientForSpace(CIO),
+        SpaceAppInstance(
+            clientId ?: System.getProperty("space.clientId"),
+            clientSecret ?: System.getProperty("space.clientSecret"),
+            spaceUrl
+        ),
+        SpaceAuth.ClientCredentials()
     )
 
-    if (pathValue.isDirectory()) {
-        space.processDirectory(client, spaceUrl, pathValue)
-    } else {
-        space.processDocument(client, spaceUrl, pathValue)
+    coroutineScope {
+        println("Processing project \"${space.projects.getProject(ProjectIdentifier.Key(project)).name}\"")
+        space.downloadAndProcessDocumentsInProject(
+            target,
+            ProjectIdentifier.Key(project),
+            folderId?.let { FolderIdentifier.Id(it) } ?: FolderIdentifier.Root
+        )
     }
-
 }
\ No newline at end of file
diff --git a/src/main/kotlin/process.kt b/src/main/kotlin/process.kt
index 3ec6b86..ed5ae93 100644
--- a/src/main/kotlin/process.kt
+++ b/src/main/kotlin/process.kt
@@ -1,7 +1,5 @@
 package ru.mipt.npm.space.documentextractor
 
-import io.ktor.client.HttpClient
-import io.ktor.client.request.HttpRequestBuilder
 import io.ktor.client.request.header
 import io.ktor.client.request.request
 import io.ktor.client.request.url
@@ -9,38 +7,59 @@ import io.ktor.client.statement.HttpResponse
 import io.ktor.client.statement.readBytes
 import io.ktor.http.HttpHeaders
 import io.ktor.http.HttpMethod
-import kotlinx.coroutines.runBlocking
+import io.ktor.utils.io.jvm.javaio.copyTo
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.coroutineScope
+import kotlinx.coroutines.launch
 import org.slf4j.LoggerFactory
-import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext
+import space.jetbrains.api.runtime.Batch
+import space.jetbrains.api.runtime.SpaceClient
+import space.jetbrains.api.runtime.resources.projects
+import space.jetbrains.api.runtime.types.*
 import java.nio.file.Files
 import java.nio.file.Path
-import kotlin.io.path.isDirectory
-import kotlin.io.path.readText
-import kotlin.io.path.writeBytes
-import kotlin.io.path.writeText
+import kotlin.io.path.*
 import kotlin.streams.toList
 
-fun SpaceHttpClientWithCallContext.extractImage(
-    client: HttpClient,
-    spaceUrl: String,
+private val logger by lazy { LoggerFactory.getLogger("space-extractor") }
+
+internal suspend fun SpaceClient.extractImage(
     parent: Path,
     imageId: String,
     imageFileName: String,
-) = runBlocking {
-    val request = HttpRequestBuilder().apply {
-        val token = callContext.tokenSource.token()
-        url("$spaceUrl/d/$imageId")
+) {
+    logger.info("Downloading image file $imageFileName to $parent")
+    val response = ktorClient.request<HttpResponse> {
+        url("${server.serverUrl}/d/$imageId")
         method = HttpMethod.Get
-        header(HttpHeaders.Authorization, "Bearer ${token.accessToken}")
+        header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
     }
-    val response = client.request<HttpResponse>(request)
     val file = parent.resolve("images/$imageFileName")
+    file.parent.createDirectories()
     file.writeBytes(response.readBytes())
 }
 
+internal suspend fun SpaceClient.extractDocument(
+    parent: Path,
+    documentId: String,
+    documentFileName: String,
+) {
+    //https://mipt-npm.jetbrains.space/drive/files/3qe9i43qtPq2
+    logger.info("Downloading document file $documentFileName to $parent")
+    val response = ktorClient.request<HttpResponse> {
+        url("${server.serverUrl}/drive/files/$documentId")
+        method = HttpMethod.Get
+        header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
+    }
+    val file = parent.resolve(documentFileName)
+    file.outputStream().use {
+        response.content.copyTo(it)
+    }
+}
+
 private val regex = """!\[(?<alt>.*)]\(/d/(?<id>.*)\?f=0""".toRegex()
 
-fun SpaceHttpClientWithCallContext.processDocument(client: HttpClient, spaceUrl: String, path: Path) {
+internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope{
     val documentBody = path.readText()
     val logger = LoggerFactory.getLogger("space-document-extractor")
     logger.info("Processing file $path...")
@@ -48,25 +67,87 @@ fun SpaceHttpClientWithCallContext.processDocument(client: HttpClient, spaceUrl:
         val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
         val alt = it.groups["alt"]?.value
         logger.info("Downloading image $id as images/$id")
-        extractImage(client, spaceUrl, path.parent, id, id)
+        launch(Dispatchers.IO) {
+            extractImage(path.parent, id, id)
+        }
         "![$alt](images/$id"
     }
 
     path.writeText(newText)
 }
 
-fun SpaceHttpClientWithCallContext.processDirectory(
-    client: HttpClient,
-    spaceUrl: String,
+/**
+ * Download images for markdown documents in the directory
+ */
+internal suspend fun SpaceClient.processMarkdownInDirectory(
     path: Path,
     fileExtension: String = ".md",
     recursive: Boolean = true,
 ) {
     Files.list(path).toList().forEach {
         if (it.toString().endsWith(fileExtension)) {
-            processDocument(client, spaceUrl, it)
+            logger.info("Updating links in a markdown $it")
+            processMarkdownDocument(it)
         } else if (recursive && it.isDirectory()) {
-            processDirectory(client, spaceUrl, it, fileExtension)
+            processMarkdownInDirectory(it, fileExtension)
         }
     }
+}
+
+internal suspend fun SpaceClient.downloadDocument(
+    directory: Path,
+    document: Document,
+) = coroutineScope {
+    when (val body = document.documentBody) {
+        is FileDocumentBody -> {
+            launch(Dispatchers.IO) {
+                extractDocument(directory, document.id, document.title)
+            }
+        }
+        is TextDocument -> {
+            val markdownFilePath = directory.resolve(document.title + ".md")
+            markdownFilePath.writeText(body.text, Charsets.UTF_8)
+        }
+        else -> {
+            LoggerFactory.getLogger("space-extractor")
+                .warn("Can't extract document ${document.title} with type ${document.bodyType}")
+        }
+    }
+}
+
+internal suspend fun SpaceClient.downloadDocumentFolder(
+    directory: Path,
+    projectId: ProjectIdentifier,
+    folderId: FolderIdentifier,
+) {
+    directory.createDirectories()
+    logger.info("Processing folder ${folderId.compactId} to $directory")
+    val documents = projects.documents.folders.documents.listDocumentsInFolder(projectId, folderId) {
+        id()
+    }
+    documents.data.forEach {
+        val document = projects.documents.getDocument(projectId, it.id) {
+            id()
+            title()
+            documentBody()
+            bodyType()
+        }
+        downloadDocument(directory, document)
+    }
+
+    val subFolders = projects.documents.folders.subfolders.listSubfolders(projectId, folderId)
+    subFolders.data.forEach {
+        val subPath = directory.resolve(it.name)
+        downloadDocumentFolder(subPath, projectId, FolderIdentifier.Id(it.id))
+    }
+}
+
+suspend fun SpaceClient.downloadAndProcessDocumentsInProject(
+    directory: Path,
+    projectId: ProjectIdentifier,
+    rootFolder: FolderIdentifier = FolderIdentifier.Root,
+) {
+    logger.info("Processing project ${projectId.compactId} to $directory")
+    downloadDocumentFolder(directory, projectId, rootFolder)
+    processMarkdownInDirectory(directory)
 }
\ No newline at end of file