Full extraction
This commit is contained in:
parent
5cd96ae106
commit
39cdff3726
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,3 +1,4 @@
|
||||
.idea/
|
||||
.gradle/
|
||||
build/
|
||||
build/
|
||||
output/
|
@ -20,7 +20,7 @@ dependencies {
|
||||
implementation("io.ktor:ktor-client-cio:$ktorVersion")
|
||||
implementation("io.ktor:ktor-client-auth:$ktorVersion")
|
||||
implementation("org.jetbrains.kotlinx:kotlinx-cli:0.3.4")
|
||||
implementation("org.jetbrains:space-sdk-jvm:86641-beta")
|
||||
implementation("org.jetbrains:space-sdk-jvm:98244-beta")
|
||||
implementation("ch.qos.logback:logback-classic:1.2.10")
|
||||
testImplementation(kotlin("test"))
|
||||
}
|
||||
|
2
gradle/wrapper/gradle-wrapper.properties
vendored
2
gradle/wrapper/gradle-wrapper.properties
vendored
@ -1,5 +1,5 @@
|
||||
distributionBase=GRADLE_USER_HOME
|
||||
distributionPath=wrapper/dists
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.2-bin.zip
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
|
||||
zipStoreBase=GRADLE_USER_HOME
|
||||
zipStorePath=wrapper/dists
|
||||
|
@ -1,26 +1,40 @@
|
||||
package ru.mipt.npm.space.documentextractor
|
||||
|
||||
import io.ktor.client.HttpClient
|
||||
import io.ktor.client.engine.cio.CIO
|
||||
import kotlinx.cli.ArgParser
|
||||
import kotlinx.cli.ArgType
|
||||
import kotlinx.cli.required
|
||||
import space.jetbrains.api.runtime.SpaceHttpClient
|
||||
import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext
|
||||
import space.jetbrains.api.runtime.withServiceAccountTokenSource
|
||||
import kotlinx.coroutines.coroutineScope
|
||||
import space.jetbrains.api.runtime.SpaceAppInstance
|
||||
import space.jetbrains.api.runtime.SpaceAuth
|
||||
import space.jetbrains.api.runtime.SpaceClient
|
||||
import space.jetbrains.api.runtime.ktorClientForSpace
|
||||
import space.jetbrains.api.runtime.resources.projects
|
||||
import space.jetbrains.api.runtime.types.FolderIdentifier
|
||||
import space.jetbrains.api.runtime.types.ProjectIdentifier
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.exists
|
||||
import kotlin.io.path.isDirectory
|
||||
|
||||
suspend fun main(args: Array<String>) {
|
||||
val parser = ArgParser("space-document-extractor")
|
||||
val path by parser.option(ArgType.String, description = "Input file or directory").required()
|
||||
|
||||
val spaceUrl by parser.option(
|
||||
ArgType.String,
|
||||
description = "Url of the space instance like 'https://mipt-npm.jetbrains.space'"
|
||||
).required()
|
||||
|
||||
val project by parser.option(
|
||||
ArgType.String,
|
||||
description = "The key of the exported project"
|
||||
).required()
|
||||
|
||||
val path: String? by parser.option(ArgType.String, description = "Target directory. Default is current directory")
|
||||
|
||||
val folderId: String? by parser.option(
|
||||
ArgType.String,
|
||||
description = "FolderId for the folder to export"
|
||||
)
|
||||
|
||||
val clientId by parser.option(
|
||||
ArgType.String,
|
||||
description = "Space application client ID (if not defined, use environment value 'space.clientId')"
|
||||
@ -31,26 +45,29 @@ suspend fun main(args: Array<String>) {
|
||||
description = "Space application client secret (if not defined, use environment value 'space.clientSecret')"
|
||||
)
|
||||
|
||||
|
||||
parser.parse(args)
|
||||
|
||||
val pathValue: Path = Path.of(path)
|
||||
val target: Path = path?.let { Path.of(path) } ?: Path.of("output/$project")
|
||||
|
||||
if (!pathValue.exists()) {
|
||||
error("File or directory not found at $path")
|
||||
}
|
||||
Files.createDirectories(target)
|
||||
|
||||
val client = HttpClient(CIO)
|
||||
val space: SpaceHttpClientWithCallContext = SpaceHttpClient(client).withServiceAccountTokenSource(
|
||||
clientId = clientId ?: System.getProperty("space.clientId"),
|
||||
clientSecret = clientSecret ?: System.getProperty("space.clientSecret"),
|
||||
serverUrl = "https://mipt-npm.jetbrains.space"
|
||||
|
||||
val space: SpaceClient = SpaceClient(
|
||||
ktorClientForSpace(CIO),
|
||||
SpaceAppInstance(
|
||||
clientId ?: System.getProperty("space.clientId"),
|
||||
clientSecret ?: System.getProperty("space.clientSecret"),
|
||||
spaceUrl
|
||||
),
|
||||
SpaceAuth.ClientCredentials()
|
||||
)
|
||||
|
||||
if (pathValue.isDirectory()) {
|
||||
space.processDirectory(client, spaceUrl, pathValue)
|
||||
} else {
|
||||
space.processDocument(client, spaceUrl, pathValue)
|
||||
coroutineScope {
|
||||
println("Processing project \"${space.projects.getProject(ProjectIdentifier.Key(project)).name}\"")
|
||||
space.downloadAndProcessDocumentsInProject(
|
||||
target,
|
||||
ProjectIdentifier.Key(project),
|
||||
folderId?.let { FolderIdentifier.Id(it) } ?: FolderIdentifier.Root
|
||||
)
|
||||
}
|
||||
|
||||
}
|
@ -1,7 +1,5 @@
|
||||
package ru.mipt.npm.space.documentextractor
|
||||
|
||||
import io.ktor.client.HttpClient
|
||||
import io.ktor.client.request.HttpRequestBuilder
|
||||
import io.ktor.client.request.header
|
||||
import io.ktor.client.request.request
|
||||
import io.ktor.client.request.url
|
||||
@ -9,38 +7,59 @@ import io.ktor.client.statement.HttpResponse
|
||||
import io.ktor.client.statement.readBytes
|
||||
import io.ktor.http.HttpHeaders
|
||||
import io.ktor.http.HttpMethod
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import io.ktor.utils.io.jvm.javaio.copyTo
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.coroutineScope
|
||||
import kotlinx.coroutines.launch
|
||||
import org.slf4j.LoggerFactory
|
||||
import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext
|
||||
import space.jetbrains.api.runtime.Batch
|
||||
import space.jetbrains.api.runtime.SpaceClient
|
||||
import space.jetbrains.api.runtime.resources.projects
|
||||
import space.jetbrains.api.runtime.types.*
|
||||
import java.nio.file.Files
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.isDirectory
|
||||
import kotlin.io.path.readText
|
||||
import kotlin.io.path.writeBytes
|
||||
import kotlin.io.path.writeText
|
||||
import kotlin.io.path.*
|
||||
import kotlin.streams.toList
|
||||
|
||||
/** Logger shared by the extraction routines in this file; created on first use. */
private val logger by lazy { LoggerFactory.getLogger("space-extractor") }

/**
 * Downloads one image from the Space server and stores it as `images/<imageFileName>` under [parent].
 *
 * The image is requested with a GET to `<server url>/d/<imageId>`, authorized with the
 * client's bearer token. The `images` directory is created when it does not exist and the
 * whole response body is written to the target file.
 *
 * @param parent directory that receives the `images` sub-directory.
 * @param imageId identifier placed after `/d/` in the download URL.
 * @param imageFileName name of the file created inside `images`.
 */
internal suspend fun SpaceClient.extractImage(
    parent: Path,
    imageId: String,
    imageFileName: String,
) {
    logger.info("Downloading image file $imageFileName to $parent")
    val response = ktorClient.request<HttpResponse> {
        url("${server.serverUrl}/d/$imageId")
        method = HttpMethod.Get
        header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
    }
    val file = parent.resolve("images/$imageFileName")
    // The images directory may not exist yet for the first image of a document.
    file.parent.createDirectories()
    file.writeBytes(response.readBytes())
}
|
||||
|
||||
/**
 * Downloads a file-backed document from the Space drive and writes it to [parent].
 *
 * The file is requested with a GET to `<server url>/drive/files/<documentId>`
 * (for example `https://mipt-npm.jetbrains.space/drive/files/3qe9i43qtPq2`), authorized
 * with the client's bearer token, and the response content is copied into
 * `parent/documentFileName`. [parent] itself is not created here.
 *
 * @param parent directory in which the file is written.
 * @param documentId identifier placed after `/drive/files/` in the download URL.
 * @param documentFileName name of the file to write inside [parent].
 */
internal suspend fun SpaceClient.extractDocument(
    parent: Path,
    documentId: String,
    documentFileName: String,
) {
    logger.info("Downloading document file $documentFileName to $parent")
    val downloadUrl = "${server.serverUrl}/drive/files/$documentId"
    val response = ktorClient.request<HttpResponse> {
        url(downloadUrl)
        method = HttpMethod.Get
        header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
    }
    val target = parent.resolve(documentFileName)
    // `use` closes the output stream even if copying the response content fails.
    target.outputStream().use { sink ->
        response.content.copyTo(sink)
    }
}
|
||||
|
||||
/**
 * Matches the opening part of a markdown image link that points at a Space attachment:
 * `![<alt>](/d/<id>?f=0`. The closing parenthesis is intentionally not consumed.
 *
 * Both groups use negated character classes instead of a greedy `.*`, so that several
 * image links on the same line are matched one by one rather than being merged into a
 * single match that keeps only the last id.
 */
private val regex = """!\[(?<alt>[^\]]*)]\(/d/(?<id>[^?)]*)\?f=0""".toRegex()
|
||||
|
||||
fun SpaceHttpClientWithCallContext.processDocument(client: HttpClient, spaceUrl: String, path: Path) {
|
||||
internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope{
|
||||
val documentBody = path.readText()
|
||||
val logger = LoggerFactory.getLogger("space-document-extractor")
|
||||
logger.info("Processing file $path...")
|
||||
@ -48,25 +67,87 @@ fun SpaceHttpClientWithCallContext.processDocument(client: HttpClient, spaceUrl:
|
||||
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
|
||||
val alt = it.groups["alt"]?.value
|
||||
logger.info("Downloading image $id as images/$id")
|
||||
extractImage(client, spaceUrl, path.parent, id, id)
|
||||
launch(Dispatchers.IO) {
|
||||
extractImage(path.parent, id, id)
|
||||
}
|
||||
"![$alt](images/$id"
|
||||
}
|
||||
path.writeText(newText)
|
||||
|
||||
}
|
||||
|
||||
/**
 * Download images for markdown documents in the directory.
 *
 * Every entry of [path] whose string form ends with [fileExtension] is passed to
 * [processMarkdownDocument]. Any other entry that is a directory is visited in the same
 * way when [recursive] is set.
 *
 * @param path directory whose entries are inspected.
 * @param fileExtension suffix that marks an entry as a markdown document.
 * @param recursive whether sub-directories are processed as well.
 */
internal suspend fun SpaceClient.processMarkdownInDirectory(
    path: Path,
    fileExtension: String = ".md",
    recursive: Boolean = true,
) {
    // Files.list keeps a directory handle open until the stream is closed, so the listing
    // is materialized and the stream closed before any entry is processed.
    val entries = Files.list(path).use { stream -> stream.toList() }
    entries.forEach { entry ->
        if (entry.toString().endsWith(fileExtension)) {
            logger.info("Updating links in a markdown $entry")
            processMarkdownDocument(entry)
        } else if (recursive && entry.isDirectory()) {
            processMarkdownInDirectory(entry, fileExtension, recursive)
        }
    }
}
|
||||
|
||||
/**
 * Stores a single Space [document] inside [directory], choosing the strategy by body type.
 *
 * - [FileDocumentBody]: the file is downloaded via [extractDocument] in a coroutine
 *   launched on [Dispatchers.IO]; the file name is the document title.
 * - [TextDocument]: the text is written as UTF-8 to `<title>.md`.
 * - anything else: a warning is logged and nothing is written.
 *
 * The enclosing [coroutineScope] does not return until the launched download has finished.
 *
 * @param directory directory that receives the document.
 * @param document document whose `id`, `title`, `documentBody` and `bodyType` are read.
 */
internal suspend fun SpaceClient.downloadDocument(
    directory: Path,
    document: Document,
) = coroutineScope {
    when (val body = document.documentBody) {
        is FileDocumentBody -> {
            launch(Dispatchers.IO) {
                extractDocument(directory, document.id, document.title)
            }
        }
        is TextDocument -> {
            val markdownFilePath = directory.resolve(document.title + ".md")
            markdownFilePath.writeText(body.text, Charsets.UTF_8)
        }
        else -> {
            // Use the file-level logger, like every other function in this file.
            logger.warn("Can't extract document ${document.title} with type ${document.bodyType}")
        }
    }
}
|
||||
|
||||
/**
 * Mirrors one Space document folder, including its sub-folders, into [directory].
 *
 * The target directory is created first. Each document listed in the folder is then
 * fetched with its id, title, body and body type and handed to [downloadDocument].
 * Finally every sub-folder is mirrored into a child directory named after it.
 *
 * NOTE(review): only the `data` of the single [Batch] returned by each listing call is
 * consumed; if the API paginates, entries beyond the first batch are not fetched — confirm.
 *
 * @param directory local directory that represents the folder.
 * @param projectId project that owns the folder.
 * @param folderId folder to mirror.
 */
internal suspend fun SpaceClient.downloadDocumentFolder(
    directory: Path,
    projectId: ProjectIdentifier,
    folderId: FolderIdentifier,
) {
    directory.createDirectories()
    logger.info("Processing folder ${folderId.compactId} to $directory")

    // The listing requests ids only; the full document is fetched per id below.
    val documentStubs = projects.documents.folders.documents.listDocumentsInFolder(projectId, folderId) {
        id()
    }
    for (stub in documentStubs.data) {
        val fullDocument = projects.documents.getDocument(projectId, stub.id) {
            id()
            title()
            documentBody()
            bodyType()
        }
        downloadDocument(directory, fullDocument)
    }

    val childFolders: Batch<DocumentFolder> =
        projects.documents.folders.subfolders.listSubfolders(projectId, folderId)
    for (child in childFolders.data) {
        val childDirectory = directory.resolve(child.name)
        downloadDocumentFolder(childDirectory, projectId, FolderIdentifier.Id(child.id))
    }
}
|
||||
|
||||
/**
 * Exports the documents of a project into [directory] and then post-processes the result.
 *
 * The folder tree starting at [rootFolder] is downloaded with [downloadDocumentFolder];
 * afterwards [processMarkdownInDirectory] is run over [directory] with its default settings.
 *
 * @param directory local directory that receives the export.
 * @param projectId project whose documents are exported.
 * @param rootFolder folder at which the export starts; defaults to the project root.
 */
suspend fun SpaceClient.downloadAndProcessDocumentsInProject(
    directory: Path,
    projectId: ProjectIdentifier,
    rootFolder: FolderIdentifier = FolderIdentifier.Root,
) {
    logger.info("Processing project ${projectId.compactId} to $directory")

    // Step 1: download the folder tree.
    downloadDocumentFolder(directory = directory, projectId = projectId, folderId = rootFolder)

    // Step 2: process the markdown files that were written.
    processMarkdownInDirectory(path = directory)
}
|
Loading…
Reference in New Issue
Block a user