Full extraction
This commit is contained in:
parent
5cd96ae106
commit
39cdff3726
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,3 +1,4 @@
|
|||||||
.idea/
|
.idea/
|
||||||
.gradle/
|
.gradle/
|
||||||
build/
|
build/
|
||||||
|
output/
|
@ -20,7 +20,7 @@ dependencies {
|
|||||||
implementation("io.ktor:ktor-client-cio:$ktorVersion")
|
implementation("io.ktor:ktor-client-cio:$ktorVersion")
|
||||||
implementation("io.ktor:ktor-client-auth:$ktorVersion")
|
implementation("io.ktor:ktor-client-auth:$ktorVersion")
|
||||||
implementation("org.jetbrains.kotlinx:kotlinx-cli:0.3.4")
|
implementation("org.jetbrains.kotlinx:kotlinx-cli:0.3.4")
|
||||||
implementation("org.jetbrains:space-sdk-jvm:86641-beta")
|
implementation("org.jetbrains:space-sdk-jvm:98244-beta")
|
||||||
implementation("ch.qos.logback:logback-classic:1.2.10")
|
implementation("ch.qos.logback:logback-classic:1.2.10")
|
||||||
testImplementation(kotlin("test"))
|
testImplementation(kotlin("test"))
|
||||||
}
|
}
|
||||||
|
2
gradle/wrapper/gradle-wrapper.properties
vendored
2
gradle/wrapper/gradle-wrapper.properties
vendored
@ -1,5 +1,5 @@
|
|||||||
distributionBase=GRADLE_USER_HOME
|
distributionBase=GRADLE_USER_HOME
|
||||||
distributionPath=wrapper/dists
|
distributionPath=wrapper/dists
|
||||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.2-bin.zip
|
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
|
||||||
zipStoreBase=GRADLE_USER_HOME
|
zipStoreBase=GRADLE_USER_HOME
|
||||||
zipStorePath=wrapper/dists
|
zipStorePath=wrapper/dists
|
||||||
|
@ -1,26 +1,40 @@
|
|||||||
package ru.mipt.npm.space.documentextractor
|
package ru.mipt.npm.space.documentextractor
|
||||||
|
|
||||||
import io.ktor.client.HttpClient
|
|
||||||
import io.ktor.client.engine.cio.CIO
|
import io.ktor.client.engine.cio.CIO
|
||||||
import kotlinx.cli.ArgParser
|
import kotlinx.cli.ArgParser
|
||||||
import kotlinx.cli.ArgType
|
import kotlinx.cli.ArgType
|
||||||
import kotlinx.cli.required
|
import kotlinx.cli.required
|
||||||
import space.jetbrains.api.runtime.SpaceHttpClient
|
import kotlinx.coroutines.coroutineScope
|
||||||
import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext
|
import space.jetbrains.api.runtime.SpaceAppInstance
|
||||||
import space.jetbrains.api.runtime.withServiceAccountTokenSource
|
import space.jetbrains.api.runtime.SpaceAuth
|
||||||
|
import space.jetbrains.api.runtime.SpaceClient
|
||||||
|
import space.jetbrains.api.runtime.ktorClientForSpace
|
||||||
|
import space.jetbrains.api.runtime.resources.projects
|
||||||
|
import space.jetbrains.api.runtime.types.FolderIdentifier
|
||||||
|
import space.jetbrains.api.runtime.types.ProjectIdentifier
|
||||||
|
import java.nio.file.Files
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
import kotlin.io.path.exists
|
|
||||||
import kotlin.io.path.isDirectory
|
|
||||||
|
|
||||||
suspend fun main(args: Array<String>) {
|
suspend fun main(args: Array<String>) {
|
||||||
val parser = ArgParser("space-document-extractor")
|
val parser = ArgParser("space-document-extractor")
|
||||||
val path by parser.option(ArgType.String, description = "Input file or directory").required()
|
|
||||||
|
|
||||||
val spaceUrl by parser.option(
|
val spaceUrl by parser.option(
|
||||||
ArgType.String,
|
ArgType.String,
|
||||||
description = "Url of the space instance like 'https://mipt-npm.jetbrains.space'"
|
description = "Url of the space instance like 'https://mipt-npm.jetbrains.space'"
|
||||||
).required()
|
).required()
|
||||||
|
|
||||||
|
val project by parser.option(
|
||||||
|
ArgType.String,
|
||||||
|
description = "The key of the exported project"
|
||||||
|
).required()
|
||||||
|
|
||||||
|
val path: String? by parser.option(ArgType.String, description = "Target directory. Default is current directory")
|
||||||
|
|
||||||
|
val folderId: String? by parser.option(
|
||||||
|
ArgType.String,
|
||||||
|
description = "FolderId for the folder to export"
|
||||||
|
)
|
||||||
|
|
||||||
val clientId by parser.option(
|
val clientId by parser.option(
|
||||||
ArgType.String,
|
ArgType.String,
|
||||||
description = "Space application client ID (if not defined, use environment value 'space.clientId')"
|
description = "Space application client ID (if not defined, use environment value 'space.clientId')"
|
||||||
@ -31,26 +45,29 @@ suspend fun main(args: Array<String>) {
|
|||||||
description = "Space application client secret (if not defined, use environment value 'space.clientSecret')"
|
description = "Space application client secret (if not defined, use environment value 'space.clientSecret')"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
parser.parse(args)
|
parser.parse(args)
|
||||||
|
|
||||||
val pathValue: Path = Path.of(path)
|
val target: Path = path?.let { Path.of(path) } ?: Path.of("output/$project")
|
||||||
|
|
||||||
if (!pathValue.exists()) {
|
Files.createDirectories(target)
|
||||||
error("File or directory not found at $path")
|
|
||||||
}
|
|
||||||
|
|
||||||
val client = HttpClient(CIO)
|
|
||||||
val space: SpaceHttpClientWithCallContext = SpaceHttpClient(client).withServiceAccountTokenSource(
|
val space: SpaceClient = SpaceClient(
|
||||||
clientId = clientId ?: System.getProperty("space.clientId"),
|
ktorClientForSpace(CIO),
|
||||||
clientSecret = clientSecret ?: System.getProperty("space.clientSecret"),
|
SpaceAppInstance(
|
||||||
serverUrl = "https://mipt-npm.jetbrains.space"
|
clientId ?: System.getProperty("space.clientId"),
|
||||||
|
clientSecret ?: System.getProperty("space.clientSecret"),
|
||||||
|
spaceUrl
|
||||||
|
),
|
||||||
|
SpaceAuth.ClientCredentials()
|
||||||
)
|
)
|
||||||
|
|
||||||
if (pathValue.isDirectory()) {
|
coroutineScope {
|
||||||
space.processDirectory(client, spaceUrl, pathValue)
|
println("Processing project \"${space.projects.getProject(ProjectIdentifier.Key(project)).name}\"")
|
||||||
} else {
|
space.downloadAndProcessDocumentsInProject(
|
||||||
space.processDocument(client, spaceUrl, pathValue)
|
target,
|
||||||
|
ProjectIdentifier.Key(project),
|
||||||
|
folderId?.let { FolderIdentifier.Id(it) } ?: FolderIdentifier.Root
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -1,7 +1,5 @@
|
|||||||
package ru.mipt.npm.space.documentextractor
|
package ru.mipt.npm.space.documentextractor
|
||||||
|
|
||||||
import io.ktor.client.HttpClient
|
|
||||||
import io.ktor.client.request.HttpRequestBuilder
|
|
||||||
import io.ktor.client.request.header
|
import io.ktor.client.request.header
|
||||||
import io.ktor.client.request.request
|
import io.ktor.client.request.request
|
||||||
import io.ktor.client.request.url
|
import io.ktor.client.request.url
|
||||||
@ -9,38 +7,59 @@ import io.ktor.client.statement.HttpResponse
|
|||||||
import io.ktor.client.statement.readBytes
|
import io.ktor.client.statement.readBytes
|
||||||
import io.ktor.http.HttpHeaders
|
import io.ktor.http.HttpHeaders
|
||||||
import io.ktor.http.HttpMethod
|
import io.ktor.http.HttpMethod
|
||||||
import kotlinx.coroutines.runBlocking
|
import io.ktor.utils.io.jvm.javaio.copyTo
|
||||||
|
import kotlinx.coroutines.Dispatchers
|
||||||
|
import kotlinx.coroutines.coroutineScope
|
||||||
|
import kotlinx.coroutines.launch
|
||||||
import org.slf4j.LoggerFactory
|
import org.slf4j.LoggerFactory
|
||||||
import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext
|
import space.jetbrains.api.runtime.Batch
|
||||||
|
import space.jetbrains.api.runtime.SpaceClient
|
||||||
|
import space.jetbrains.api.runtime.resources.projects
|
||||||
|
import space.jetbrains.api.runtime.types.*
|
||||||
import java.nio.file.Files
|
import java.nio.file.Files
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
import kotlin.io.path.isDirectory
|
import kotlin.io.path.*
|
||||||
import kotlin.io.path.readText
|
|
||||||
import kotlin.io.path.writeBytes
|
|
||||||
import kotlin.io.path.writeText
|
|
||||||
import kotlin.streams.toList
|
import kotlin.streams.toList
|
||||||
|
|
||||||
private val logger by lazy { LoggerFactory.getLogger("space-extractor") }

/**
 * Downloads a single image attachment from the Space instance and stores it as
 * `images/<imageFileName>` inside [parent].
 *
 * The request is sent to `<serverUrl>/d/<imageId>` with the client's bearer token.
 * If the server answers with a non-2xx status, nothing is written and a warning is logged,
 * so an error page is never saved under the image's name.
 *
 * @param parent directory of the markdown document that references the image
 * @param imageId identifier of the attachment as it appears in the `/d/<id>` link
 * @param imageFileName name of the file to create inside the `images` sub-directory
 */
internal suspend fun SpaceClient.extractImage(
    parent: Path,
    imageId: String,
    imageFileName: String,
) {
    logger.info("Downloading image file $imageFileName to $parent")
    val response = ktorClient.request<HttpResponse> {
        url("${server.serverUrl}/d/$imageId")
        method = HttpMethod.Get
        header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
    }
    // Do not persist the body of an error response as if it were the image.
    if (response.status.value !in 200..299) {
        logger.warn("Failed to download image $imageId: server responded with ${response.status}")
        return
    }
    val file = parent.resolve("images/$imageFileName")
    file.parent.createDirectories()
    file.writeBytes(response.readBytes())
}
|
||||||
|
|
||||||
|
/**
 * Downloads a file-based document from the Space instance into [parent].
 *
 * The request is sent to `<serverUrl>/drive/files/<documentId>` with the client's bearer token
 * and the response body is streamed into `parent/documentFileName`.
 * If the server answers with a non-2xx status, nothing is written and a warning is logged.
 *
 * @param parent directory that receives the file; created if it does not exist yet
 * @param documentId identifier of the document in Space
 * @param documentFileName name of the file to create inside [parent]
 */
internal suspend fun SpaceClient.extractDocument(
    parent: Path,
    documentId: String,
    documentFileName: String,
) {
    // Example of the addressed resource: https://mipt-npm.jetbrains.space/drive/files/3qe9i43qtPq2
    logger.info("Downloading document file $documentFileName to $parent")
    val response = ktorClient.request<HttpResponse> {
        url("${server.serverUrl}/drive/files/$documentId")
        method = HttpMethod.Get
        header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
    }
    // Do not persist the body of an error response as if it were the document.
    if (response.status.value !in 200..299) {
        logger.warn("Failed to download document $documentId: server responded with ${response.status}")
        return
    }
    val file = parent.resolve(documentFileName)
    // Same guarantee extractImage gives: the target directory exists before writing.
    file.parent?.createDirectories()
    file.outputStream().use { output ->
        response.content.copyTo(output)
    }
}
|
||||||
|
|
||||||
/**
 * Matches a markdown image reference that points to a Space attachment, up to and including
 * the `?f=0` query marker: `![alt](/d/<id>?f=0`.
 *
 * Both groups are bounded by negated character classes instead of greedy `.*`, so several
 * image references on the same line are matched one by one instead of being merged into a
 * single match that only keeps the last id.
 *
 * Groups: `alt` is the alternative text (may be empty), `id` is the attachment id (non-empty).
 */
private val regex = """!\[(?<alt>[^\]]*)]\(/d/(?<id>[^?)]+)\?f=0""".toRegex()
|
||||||
|
|
||||||
fun SpaceHttpClientWithCallContext.processDocument(client: HttpClient, spaceUrl: String, path: Path) {
|
internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutineScope{
|
||||||
val documentBody = path.readText()
|
val documentBody = path.readText()
|
||||||
val logger = LoggerFactory.getLogger("space-document-extractor")
|
val logger = LoggerFactory.getLogger("space-document-extractor")
|
||||||
logger.info("Processing file $path...")
|
logger.info("Processing file $path...")
|
||||||
@ -48,25 +67,87 @@ fun SpaceHttpClientWithCallContext.processDocument(client: HttpClient, spaceUrl:
|
|||||||
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
|
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
|
||||||
val alt = it.groups["alt"]?.value
|
val alt = it.groups["alt"]?.value
|
||||||
logger.info("Downloading image $id as images/$id")
|
logger.info("Downloading image $id as images/$id")
|
||||||
extractImage(client, spaceUrl, path.parent, id, id)
|
launch(Dispatchers.IO) {
|
||||||
|
extractImage(path.parent, id, id)
|
||||||
|
}
|
||||||
"![$alt](images/$id"
|
"![$alt](images/$id"
|
||||||
}
|
}
|
||||||
path.writeText(newText)
|
path.writeText(newText)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Download images for markdown documents in the directory.
 *
 * Every entry of [path] whose name ends with [fileExtension] is passed to
 * [processMarkdownDocument]; sub-directories are visited as well when [recursive] is set.
 *
 * @param path directory to scan
 * @param fileExtension suffix that marks a markdown document
 * @param recursive whether sub-directories are processed too
 */
internal suspend fun SpaceClient.processMarkdownInDirectory(
    path: Path,
    fileExtension: String = ".md",
    recursive: Boolean = true,
) {
    // Files.list holds an open directory handle until the stream is closed, so close it
    // right after the listing has been materialized.
    val entries = Files.list(path).use { it.toList() }
    entries.forEach { entry ->
        when {
            // Directories are checked first so that a directory named like "notes.md"
            // is descended into instead of being read as a document.
            entry.isDirectory() -> {
                if (recursive) {
                    processMarkdownInDirectory(entry, fileExtension, recursive)
                }
            }
            entry.toString().endsWith(fileExtension) -> {
                logger.info("Updating links in a markdown $entry")
                processMarkdownDocument(entry)
            }
        }
    }
}
|
||||||
|
|
||||||
|
/** Characters that must not appear in a single file-name segment (path separators, reserved and control characters). */
private val unsafeFileNameChars = Regex("""[\\/:*?"<>|\p{Cntrl}]""")

/**
 * Turns a document title into a single safe path segment: unsafe characters become `_`,
 * surrounding whitespace and trailing dots are dropped (which also neutralizes `.` and `..`),
 * and an empty result falls back to `untitled`.
 */
private fun String.toSafeFileName(): String =
    replace(unsafeFileNameChars, "_").trim().trimEnd('.').ifEmpty { "untitled" }

/**
 * Stores a single Space [document] inside [directory].
 *
 * File-based documents are downloaded with [extractDocument]; text documents are written
 * as `<title>.md` in UTF-8. Any other body type is reported with a warning and skipped.
 * The title comes from the remote service, so it is sanitized before being used as a file
 * name; otherwise a title containing `/` or `..` could escape [directory].
 *
 * @param directory target directory for the document
 * @param document document with `id`, `title`, `documentBody` and `bodyType` loaded
 */
internal suspend fun SpaceClient.downloadDocument(
    directory: Path,
    document: Document,
) = coroutineScope {
    val fileName = document.title.toSafeFileName()
    when (val body = document.documentBody) {
        is FileDocumentBody -> {
            // coroutineScope waits for this child, so the download is finished on return.
            launch(Dispatchers.IO) {
                extractDocument(directory, document.id, fileName)
            }
        }
        is TextDocument -> {
            val markdownFilePath = directory.resolve("$fileName.md")
            markdownFilePath.writeText(body.text, Charsets.UTF_8)
        }
        else -> {
            logger.warn("Can't extract document ${document.title} with type ${document.bodyType}")
        }
    }
}
|
||||||
|
|
||||||
|
/**
 * Mirrors one Space document folder into [directory].
 *
 * The directory is created first, then every document listed in the folder is fetched with
 * its id, title, body and body type and handed to [downloadDocument]. Afterwards each
 * sub-folder is mirrored into a child directory named after the sub-folder.
 *
 * NOTE(review): only the `data` of the first returned batch is consumed for both documents
 * and sub-folders; presumably larger folders need paging — confirm against the SDK.
 *
 * @param directory local directory that receives the folder content
 * @param projectId project that owns the folder
 * @param folderId folder to mirror
 */
internal suspend fun SpaceClient.downloadDocumentFolder(
    directory: Path,
    projectId: ProjectIdentifier,
    folderId: FolderIdentifier,
) {
    directory.createDirectories()
    logger.info("Processing folder ${folderId.compactId} to $directory")

    // The listing carries ids only; the full document is requested separately below.
    val listing = projects.documents.folders.documents.listDocumentsInFolder(projectId, folderId) {
        id()
    }
    for (entry in listing.data) {
        val fullDocument = projects.documents.getDocument(projectId, entry.id) {
            id()
            title()
            documentBody()
            bodyType()
        }
        downloadDocument(directory, fullDocument)
    }

    val childFolders: Batch<DocumentFolder> = projects.documents.folders.subfolders.listSubfolders(projectId, folderId)
    for (child in childFolders.data) {
        val childDirectory = directory.resolve(child.name)
        downloadDocumentFolder(childDirectory, projectId, FolderIdentifier.Id(child.id))
    }
}
|
||||||
|
|
||||||
|
/**
 * Exports the documents of a Space project into [directory] in two passes: the folder tree
 * starting at [rootFolder] is downloaded first, then the markdown files found in
 * [directory] are post-processed.
 *
 * @param directory local target directory
 * @param projectId project to export
 * @param rootFolder folder to start from; the project root by default
 */
suspend fun SpaceClient.downloadAndProcessDocumentsInProject(
    directory: Path,
    projectId: ProjectIdentifier,
    rootFolder: FolderIdentifier = FolderIdentifier.Root,
) {
    logger.info("Processing project ${projectId.compactId} to $directory")
    // Pass 1: mirror the remote folder structure and its documents.
    downloadDocumentFolder(directory = directory, projectId = projectId, folderId = rootFolder)
    // Pass 2: process the markdown files that were just written.
    processMarkdownInDirectory(path = directory)
}
|
Loading…
Reference in New Issue
Block a user