Add full chat extraction
This commit is contained in:
parent
c2a5ace4c4
commit
2734c18b11
@ -28,38 +28,33 @@ internal val logger by lazy { LoggerFactory.getLogger("space-extractor") }
|
||||
* Extract single attachment image
|
||||
*/
|
||||
internal suspend fun SpaceClient.extractImage(
    imageFile: Path,
    imageId: String,
) {
    // imageFile is the destination path; its parent directory is not created here,
    // so the caller is expected to have created it.
    logger.info("Downloading image file to $imageFile")
    // Images are requested from the `/d/<id>` endpoint of the Space server using the client's bearer token.
    val response = ktorClient.request {
        url("${server.serverUrl}/d/$imageId")
        method = HttpMethod.Get
        header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
    }
    // The whole body is read into memory and written in one go.
    imageFile.writeBytes(response.readBytes())
}
|
||||
|
||||
/**
 * Extract single file
 *
 * Requests `/drive/files/<documentId>` from the Space server with the client's bearer token and
 * streams the response body into [documentFile].
 *
 * @param documentFile destination path; its parent directory is not created here, so the caller
 * is expected to have created it.
 * @param documentId identifier of the file to download.
 */
internal suspend fun SpaceClient.extractFile(
    documentFile: Path,
    documentId: String,
) {
    //https://mipt-npm.jetbrains.space/drive/files/3qe9i43qtPq2
    logger.info("Downloading document file to $documentFile")
    val response = ktorClient.request {
        url("${server.serverUrl}/drive/files/$documentId")
        method = HttpMethod.Get
        header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
    }
    // Copy the body channel straight to the file; `use` closes the stream even if copying fails.
    documentFile.outputStream().use {
        response.bodyAsChannel().copyTo(it)
    }
}
|
||||
@ -73,13 +68,16 @@ internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutine
|
||||
val documentBody = path.readText()
|
||||
val logger = LoggerFactory.getLogger("space-document-extractor")
|
||||
logger.info("Processing file $path...")
|
||||
val imageDirectory = path.parent.resolve("images")
|
||||
imageDirectory.createDirectories()
|
||||
val newText = documentBody.replace(regex) {
|
||||
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
|
||||
val alt = it.groups["alt"]?.value
|
||||
val imageName = alt?.let { "$id-$alt" } ?: id
|
||||
logger.info("Downloading image $id as images/$imageName")
|
||||
val imageFile = imageDirectory.resolve(imageName)
|
||||
logger.info("Downloading image $id as $imageFile")
|
||||
launch(Dispatchers.IO) {
|
||||
extractImage(path.parent, id, imageName)
|
||||
extractImage(imageFile, id)
|
||||
}
|
||||
"![$alt](images/$imageName"
|
||||
}
|
||||
@ -118,7 +116,7 @@ internal suspend fun SpaceClient.downloadDocument(
|
||||
when (val body = document.body) {
|
||||
is FileDocumentHttpBody -> {
|
||||
launch(Dispatchers.IO) {
|
||||
extractFile(directory, document.id, document.title)
|
||||
extractFile(directory.resolve(document.title), document.id)
|
||||
}
|
||||
}
|
||||
|
||||
@ -129,7 +127,7 @@ internal suspend fun SpaceClient.downloadDocument(
|
||||
markdownFilePath.writeText(content.markdown, Charsets.UTF_8)
|
||||
} else {
|
||||
launch(Dispatchers.IO) {
|
||||
extractFile(directory, document.id, document.title)
|
||||
extractFile(directory.resolve(document.title), document.id)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -158,7 +156,6 @@ internal suspend fun SpaceClient.downloadDocumentFolder(
|
||||
val document = projects.documents.getDocument(projectId, it.id) {
|
||||
id()
|
||||
title()
|
||||
// documentBody()
|
||||
body()
|
||||
bodyType()
|
||||
}
|
||||
|
121
src/main/kotlin/extractMessages.kt
Normal file
121
src/main/kotlin/extractMessages.kt
Normal file
@ -0,0 +1,121 @@
|
||||
package center.sciprog.space.documentextractor
|
||||
|
||||
import kotlinx.datetime.Clock
|
||||
import kotlinx.datetime.Instant
|
||||
import space.jetbrains.api.runtime.SpaceClient
|
||||
import space.jetbrains.api.runtime.resources.chats
|
||||
import space.jetbrains.api.runtime.types.*
|
||||
import java.io.BufferedWriter
|
||||
import java.nio.file.Path
|
||||
import java.nio.file.StandardOpenOption
|
||||
import kotlin.io.path.createDirectories
|
||||
import kotlin.io.path.outputStream
|
||||
|
||||
/**
 * Download one attachment into [attachmentsDirectory] and append a Markdown link to it.
 *
 * The local file name is `<id>-<originalName>`, or just the id when [originalName] is null.
 */
private suspend fun SpaceClient.downloadAttachment(
    writer: BufferedWriter,
    attachmentsDirectory: Path,
    attachmentId: String,
    originalName: String?,
) {
    val name = originalName?.let { "$attachmentId-$it" } ?: attachmentId
    extractFile(attachmentsDirectory.resolve(name), attachmentId)
    // The link target is relative to the chat Markdown file, which lives next to `attachments/`.
    writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
}

/**
 * Write all messages of channel [id] to [writer] as a Markdown list, newest first.
 *
 * Messages are fetched in batches of 50. For every message the creation time, author name and
 * text are written; file, image and video attachments are downloaded with [extractFile] into the
 * `attachments` subdirectory of [parentDirectory] and linked from the output. When a message has
 * a thread, the thread's messages are written right after it by a recursive call.
 *
 * @param parentDirectory directory that holds the chat file; `attachments/` is created inside it.
 * @param writer destination for the generated Markdown. It is not closed here.
 * @param id channel (or thread) whose messages are exported.
 * @param prefix indentation prepended to every line of each message entry.
 */
private suspend fun SpaceClient.writeMessages(
    parentDirectory: Path,
    writer: BufferedWriter,
    id: ChannelIdentifier,
    prefix: String = "",
) {
    val batchSize = 50

    // The directory does not depend on the batch, so it is created once up front.
    val attachmentsDirectory = parentDirectory.resolve("attachments")
    attachmentsDirectory.createDirectories()

    // Paging cursor: each request continues from the date reported by the previous response.
    var cursor: Instant? = Clock.System.now()

    //reading messages in batches
    do {
        val result: GetMessagesResponse = chats.messages.getChannelMessages(
            channel = id,
            sorting = MessagesSorting.FromNewestToOldest,
            startFromDate = cursor,
            batchSize = batchSize
        ) {
            nextStartFromDate()
            messages {
                author {
                    name()
                }
                text()
                created()
                attachments()
                thread {
                    content()
                }
            }
        }

        result.messages.forEach { message ->
            // Indent every line of the entry. A margin-based raw string would only re-indent the
            // first line of a multi-line message and would strip a leading `|` from message lines
            // (for example Markdown tables).
            val entry = "* **(${message.created}) ${message.author.name}:**\n${message.text.trimIndent()}"
            writer.appendLine(entry.prependIndent(prefix))

            message.attachments.orEmpty().forEach { info ->
                when (val attachment = info.details) {
                    is FileAttachment ->
                        downloadAttachment(writer, attachmentsDirectory, attachment.id, attachment.filename)

                    // NOTE(review): images and videos are fetched through extractFile as well — confirm
                    // that the `/drive/files/` endpoint serves them.
                    is ImageAttachment ->
                        downloadAttachment(writer, attachmentsDirectory, attachment.id, attachment.name)

                    is VideoAttachment ->
                        downloadAttachment(writer, attachmentsDirectory, attachment.id, attachment.name)

                    // Other attachment kinds (and missing details) are skipped.
                    else -> Unit
                }
            }

            (message.thread?.content as? M2ChannelContentThread)?.let { thread ->
                writeMessages(parentDirectory, writer, ChannelIdentifier.Thread(thread.record.id), " ")
            }
        }

        val fetched = result.messages.size
        cursor = result.nextStartFromDate

        // A short batch means the history is exhausted. The null check guards against restarting
        // from the newest message if the server returns no continuation date with a full batch.
    } while (fetched == batchSize && cursor != null)
}
|
||||
|
||||
|
||||
/**
 * Export the chat identified by [chatId] into a Markdown file inside [parentDirectory].
 *
 * The file is named `<channel name>.md`, using the shared channel name when the channel contact
 * carries one and the contact's default name otherwise. An existing file is overwritten.
 */
suspend fun SpaceClient.extractMessages(
    chatId: String,
    parentDirectory: Path,
) {
    val channelId = ChannelIdentifier.Id(chatId)
    val channel = chats.channels.getChannel(channelId)

    val sharedContent = channel.contact.ext as? M2SharedChannelContent
    val channelName = sharedContent?.name ?: channel.contact.defaultName

    val targetFile = parentDirectory.resolve("$channelName.md")
    targetFile.parent.createDirectories()

    val stream = targetFile.outputStream(
        StandardOpenOption.CREATE,
        StandardOpenOption.TRUNCATE_EXISTING,
        StandardOpenOption.WRITE
    )
    stream.bufferedWriter().use { out ->
        // out.append(M2ChannelRecordStructure.serialize(channel).toPrettyString())
        // out.appendLine("</hr>")
        // out.appendLine()
        writeMessages(parentDirectory, out, channelId)
    }
}
|
@ -4,9 +4,10 @@ import space.jetbrains.api.runtime.SpaceClient
|
||||
import space.jetbrains.api.runtime.resources.projects
|
||||
import space.jetbrains.api.runtime.types.ProjectIdentifier
|
||||
import java.nio.file.Path
|
||||
import kotlin.io.path.createDirectories
|
||||
import kotlin.io.path.div
|
||||
|
||||
/**
|
||||
* Clone a single repository to a [parentDirectory]
|
||||
*/
|
||||
private fun cloneRepo(
|
||||
parentDirectory: Path,
|
||||
url: String,
|
||||
@ -25,8 +26,11 @@ private fun cloneRepo(
|
||||
// .call()
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract all repos in the project into a [parentDirectory]
|
||||
*/
|
||||
suspend fun SpaceClient.extractRepos(
|
||||
directory: Path,
|
||||
parentDirectory: Path,
|
||||
projectId: ProjectIdentifier,
|
||||
) {
|
||||
val repos = projects.getProject(
|
||||
@ -41,7 +45,7 @@ suspend fun SpaceClient.extractRepos(
|
||||
project = projectId,
|
||||
repository = repo.name
|
||||
).sshUrl ?: error("Could not resolve sshUrl for ${repo.name}")
|
||||
cloneRepo(parentDirectory = directory, url)
|
||||
cloneRepo(parentDirectory = parentDirectory, url)
|
||||
} catch (ex: Exception) {
|
||||
logger.error("Failed ", ex)
|
||||
}
|
||||
|
@ -170,7 +170,44 @@ private class ExtractRepositoriesCommand : ExtractCommand("repos", "Extract repo
|
||||
|
||||
}
|
||||
|
||||
private class ExtractAllCommand : ExtractCommand("all", "Extract all data from a project") {
|
||||
/**
 * `chat` subcommand: exports all messages of a single chat into a Markdown file.
 *
 * The chat is identified by a URL of the form `<space url>/im/group/<chat id>`.
 */
private class ExtractChatsCommand : ExtractCommand("chat", "Extract all messages from a chat") {

    val path: String by option(
        ArgType.String,
        description = "Target directory. Default is './chats'."
    ).default("./chats")

    override fun execute() {
        val urlMatch = urlRegex.matchEntire(url) ?: error("Url $url does not match space chat url pattern")

        val spaceUrl = urlMatch.groups["spaceUrl"]?.value ?: error("Space Url token not recognized")

        val chatId = urlMatch.groups["chatId"]?.value ?: error("Chat id token not recognized")

        // Fall back to system properties and fail with a readable message instead of passing null on.
        val appInstance = SpaceAppInstance(
            clientId ?: System.getProperty("space.clientId")
                ?: error("Client id is not provided and system property 'space.clientId' is not set"),
            clientSecret ?: System.getProperty("space.clientSecret")
                ?: error("Client secret is not provided and system property 'space.clientSecret' is not set"),
            spaceUrl
        )

        val spaceClient: SpaceClient = SpaceClient(
            ktorClientForSpace(CIO),
            appInstance,
            SpaceAuth.ClientCredentials()
        )

        runBlocking {
            spaceClient.extractMessages(chatId, Path(path))
        }
    }

    companion object {
        // The chat id is the single path segment after `/im/group/`. Anything after it (a further
        // path, a query string such as a message reference, or a fragment) is accepted but ignored.
        private val urlRegex =
            """(?<spaceUrl>https?:\/\/[^\/]*)\/im\/group\/(?<chatId>[^\/?#]+)(?:[\/?#].*)?""".toRegex()
    }
}
|
||||
|
||||
private class ExtractProjectCommand : ExtractCommand("project", "Extract all data from a project") {
|
||||
|
||||
val path: String by option(
|
||||
ArgType.String,
|
||||
@ -235,7 +272,7 @@ private class ExtractAllCommand : ExtractCommand("all", "Extract all data from a
|
||||
/**
 * Command-line entry point: registers the documents, repositories, project and chat subcommands
 * on the `space-export` parser and parses [args].
 */
fun main(args: Array<String>) {
    val parser = ArgParser("space-export")

    parser.subcommands(
        ExtractDocumentsCommand(),
        ExtractRepositoriesCommand(),
        ExtractProjectCommand(),
        ExtractChatsCommand(),
    )

    parser.parse(args)
}
|
Loading…
Reference in New Issue
Block a user