Add full chat extraction

This commit is contained in:
Alexander Nozik 2023-08-27 18:41:43 +03:00
parent c2a5ace4c4
commit 2734c18b11
4 changed files with 181 additions and 22 deletions

View File

@ -28,38 +28,33 @@ internal val logger by lazy { LoggerFactory.getLogger("space-extractor") }
* Extract single attachment image
*/
internal suspend fun SpaceClient.extractImage(
// Directory in which the `images/<imageFileName>` path below is resolved.
parent: Path,
// Path that receives the downloaded bytes directly.
imageFile: Path,
// Identifier appended to the `/d/` download URL.
imageId: String,
// File name used inside the `images` subdirectory of [parent].
imageFileName: String,
) {
// NOTE(review): this listing appears to interleave the removed and the added lines of a diff hunk
// (the `parent`/`imageFileName` variant next to the `imageFile` variant) — confirm which variant
// is the live one before relying on this text.
logger.info("Downloading image file $imageFileName to $parent")
logger.info("Downloading image file to $imageFile")
// GET `<serverUrl>/d/<imageId>`, authorised with the client's bearer token.
val response = ktorClient.request {
url("${server.serverUrl}/d/$imageId")
method = HttpMethod.Get
header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
}
// `parent` variant: resolve the target under `images/` and create the missing directories first.
val file = parent.resolve("images/$imageFileName")
file.parent.createDirectories()
file.writeBytes(response.readBytes())
// `imageFile` variant: write straight to the given path; no directory is created here.
imageFile.writeBytes(response.readBytes())
}
/**
* Extract single file
*
* Requests `<serverUrl>/drive/files/<documentId>` with the client's bearer token and copies
* the response body into a file on disk.
*
* NOTE(review): this listing appears to interleave the removed and the added lines of a diff hunk
* (the `parent`/`documentFileName` variant next to the `documentFile` variant) — confirm which
* variant is the live one before relying on this text.
*/
internal suspend fun SpaceClient.extractFile(
// Directory in which [documentFileName] is resolved.
parent: Path,
// Path that receives the downloaded content directly.
documentFile: Path,
// Identifier appended to the `/drive/files/` URL.
documentId: String,
// File name resolved against [parent].
documentFileName: String,
) {
// Example of the URL shape requested below:
//https://mipt-npm.jetbrains.space/drive/files/3qe9i43qtPq2
logger.info("Downloading document file $documentFileName to $parent")
logger.info("Downloading document file to $documentFile")
val response = ktorClient.request {
url("${server.serverUrl}/drive/files/$documentId")
method = HttpMethod.Get
header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
}
// `parent` variant: the target is resolved against the parent directory.
val file = parent.resolve(documentFileName)
file.outputStream().use {
// `documentFile` variant: the response channel is copied into the stream opened on the given path;
// `use` closes the stream when the copy finishes or fails.
documentFile.outputStream().use {
response.bodyAsChannel().copyTo(it)
}
}
@ -73,13 +68,16 @@ internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutine
val documentBody = path.readText()
val logger = LoggerFactory.getLogger("space-document-extractor")
logger.info("Processing file $path...")
val imageDirectory = path.parent.resolve("images")
imageDirectory.createDirectories()
val newText = documentBody.replace(regex) {
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
val alt = it.groups["alt"]?.value
val imageName = alt?.let { "$id-$alt" } ?: id
logger.info("Downloading image $id as images/$imageName")
val imageFile = imageDirectory.resolve(imageName)
logger.info("Downloading image $id as $imageFile")
launch(Dispatchers.IO) {
extractImage(path.parent, id, imageName)
extractImage(imageFile, id)
}
"![$alt](images/$imageName"
}
@ -118,7 +116,7 @@ internal suspend fun SpaceClient.downloadDocument(
when (val body = document.body) {
is FileDocumentHttpBody -> {
launch(Dispatchers.IO) {
extractFile(directory, document.id, document.title)
extractFile(directory.resolve(document.title), document.id)
}
}
@ -129,7 +127,7 @@ internal suspend fun SpaceClient.downloadDocument(
markdownFilePath.writeText(content.markdown, Charsets.UTF_8)
} else {
launch(Dispatchers.IO) {
extractFile(directory, document.id, document.title)
extractFile(directory.resolve(document.title), document.id)
}
}
}
@ -158,7 +156,6 @@ internal suspend fun SpaceClient.downloadDocumentFolder(
val document = projects.documents.getDocument(projectId, it.id) {
id()
title()
// documentBody()
body()
bodyType()
}

View File

@ -0,0 +1,121 @@
package center.sciprog.space.documentextractor
import kotlinx.datetime.Clock
import kotlinx.datetime.Instant
import space.jetbrains.api.runtime.SpaceClient
import space.jetbrains.api.runtime.resources.chats
import space.jetbrains.api.runtime.types.*
import java.io.BufferedWriter
import java.nio.file.Path
import java.nio.file.StandardOpenOption
import kotlin.io.path.createDirectories
import kotlin.io.path.outputStream
/**
 * Number of messages requested per call. A batch shorter than this is treated as the end
 * of the channel history.
 */
private const val MESSAGE_BATCH_SIZE = 50

/**
 * Write the messages of channel [id] to [writer] as a Markdown bullet list and download their
 * file, image and video attachments into the `attachments` subdirectory of [parentDirectory].
 *
 * Messages are requested from newest to oldest in batches of [MESSAGE_BATCH_SIZE], starting at
 * the current time. Threads attached to a message are written recursively right after it.
 *
 * @param parentDirectory directory that contains the `attachments` subdirectory
 * @param writer destination of the Markdown text; it is not closed by this function
 * @param id channel or thread to read
 * @param prefix indentation prepended to every line of a message entry
 */
private suspend fun SpaceClient.writeMessages(
    parentDirectory: Path,
    writer: BufferedWriter,
    id: ChannelIdentifier,
    prefix: String = "",
) {
    // The directory does not depend on the batch, so it is prepared once instead of per request.
    val attachmentsDirectory = parentDirectory.resolve("attachments")
    attachmentsDirectory.createDirectories()

    var readDateTime: Instant? = Clock.System.now()
    var read: Int
    // Reading messages in batches
    do {
        val result: GetMessagesResponse = chats.messages.getChannelMessages(
            channel = id,
            sorting = MessagesSorting.FromNewestToOldest,
            startFromDate = readDateTime,
            batchSize = MESSAGE_BATCH_SIZE,
        ) {
            nextStartFromDate()
            messages {
                author {
                    name()
                }
                text()
                created()
                attachments()
                thread {
                    content()
                }
            }
        }

        result.messages.forEach { message ->
            // The entry is assembled explicitly and indented with prependIndent: a margin-based
            // template would strip a leading '|' from message lines (e.g. Markdown tables) and
            // would leave continuation lines of a multi-line message without the prefix.
            val entry = "* **(${message.created}) ${message.author.name}:**\n${message.text.trimIndent()}"
            writer.appendLine(entry.prependIndent(prefix))

            message.attachments?.forEach { info ->
                writeAttachment(attachmentsDirectory, writer, info.details)
            }

            (message.thread?.content as? M2ChannelContentThread)?.let { thread ->
                writeMessages(parentDirectory, writer, ChannelIdentifier.Thread(thread.record.id), " ")
            }
        }

        read = result.messages.count()
        readDateTime = result.nextStartFromDate
        // Stop on a short batch, and also when no continuation date is returned: repeating the
        // request with a null start date could otherwise loop forever.
    } while (read == MESSAGE_BATCH_SIZE && readDateTime != null)
}

/**
 * Download a single file, image or video [attachment] into [attachmentsDirectory] and append a
 * Markdown link to it to [writer]. Any other attachment kind, or `null`, is ignored.
 */
private suspend fun SpaceClient.writeAttachment(
    attachmentsDirectory: Path,
    writer: BufferedWriter,
    attachment: Attachment?,
) {
    // The local file name is prefixed with the attachment id; without a name the id is used alone.
    val (fileId, name) = when (attachment) {
        is FileAttachment -> attachment.id to "${attachment.id}-${attachment.filename}"
        is ImageAttachment -> attachment.id to (attachment.name?.let { "${attachment.id}-$it" } ?: attachment.id)
        is VideoAttachment -> attachment.id to (attachment.name?.let { "${attachment.id}-$it" } ?: attachment.id)
        else -> return
    }
    // NOTE(review): images and videos go through extractFile, exactly as before — confirm that
    // its endpoint serves chat attachments of every kind.
    extractFile(attachmentsDirectory.resolve(name), fileId)
    writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
}
/**
 * Export the chat with identifier [chatId] into a Markdown file inside [parentDirectory].
 *
 * The file is named after the shared channel name when the channel has one, otherwise after
 * the contact's default name. An existing file with the same name is overwritten.
 */
suspend fun SpaceClient.extractMessages(
    chatId: String,
    parentDirectory: Path,
) {
    val channelId = ChannelIdentifier.Id(chatId)
    val channelRecord = chats.channels.getChannel(channelId)

    val sharedName = (channelRecord.contact.ext as? M2SharedChannelContent)?.name
    val name = sharedName ?: channelRecord.contact.defaultName

    val target = parentDirectory.resolve("$name.md")
    target.parent.createDirectories()

    val stream = target.outputStream(
        StandardOpenOption.CREATE,
        StandardOpenOption.TRUNCATE_EXISTING,
        StandardOpenOption.WRITE
    )
    stream.bufferedWriter().use { markdown ->
        // Disabled channel header dump, kept for reference:
        // out.append(M2ChannelRecordStructure.serialize(channel).toPrettyString())
        // out.appendLine("</hr>")
        // out.appendLine()
        writeMessages(parentDirectory, markdown, channelId)
    }
}

View File

@ -4,9 +4,10 @@ import space.jetbrains.api.runtime.SpaceClient
import space.jetbrains.api.runtime.resources.projects
import space.jetbrains.api.runtime.types.ProjectIdentifier
import java.nio.file.Path
import kotlin.io.path.createDirectories
import kotlin.io.path.div
/**
* Clone a single repository to a [parentDirectory]
*/
private fun cloneRepo(
parentDirectory: Path,
url: String,
@ -25,8 +26,11 @@ private fun cloneRepo(
// .call()
}
/**
* Extract all repos in the project into a [parentDirectory]
*/
suspend fun SpaceClient.extractRepos(
directory: Path,
parentDirectory: Path,
projectId: ProjectIdentifier,
) {
val repos = projects.getProject(
@ -41,7 +45,7 @@ suspend fun SpaceClient.extractRepos(
project = projectId,
repository = repo.name
).sshUrl ?: error("Could not resolve sshUrl for ${repo.name}")
cloneRepo(parentDirectory = directory, url)
cloneRepo(parentDirectory = parentDirectory, url)
} catch (ex: Exception) {
logger.error("Failed ", ex)
}

View File

@ -170,7 +170,44 @@ private class ExtractRepositoriesCommand : ExtractCommand("repos", "Extract repo
}
private class ExtractAllCommand : ExtractCommand("all", "Extract all data from a project") {
/**
 * CLI subcommand `chat`: writes all messages of a single Space chat into the directory [path].
 *
 * The chat is selected by its web URL of the form `<spaceUrl>/im/group/<chatId>`; anything after
 * the chat id (a further path segment, a query string or a fragment) is ignored.
 *
 * NOTE(review): `url`, `clientId` and `clientSecret` are not declared here — presumably they are
 * options inherited from `ExtractCommand`; verify against the base class.
 */
private class ExtractChatsCommand : ExtractCommand("chat", "Extract all messages from a chat") {

    val path: String by option(
        ArgType.String,
        description = "Target directory. Default is './chats'."
    ).default("./chats")

    override fun execute() {
        val urlMatch = urlRegex.matchEntire(url) ?: error("Url $url does not match space chat url pattern")
        val spaceUrl = urlMatch.groups["spaceUrl"]?.value ?: error("Space Url token not recognized")
        val chatId = urlMatch.groups["chatId"]?.value ?: error("Chat id token not recognized")

        // Fall back to JVM system properties, and fail with a readable message instead of a
        // null-related crash when a credential is missing from both sources.
        val resolvedClientId = clientId
            ?: System.getProperty("space.clientId")
            ?: error("Space client id is not defined: provide it explicitly or via the 'space.clientId' system property")
        val resolvedClientSecret = clientSecret
            ?: System.getProperty("space.clientSecret")
            ?: error("Space client secret is not defined: provide it explicitly or via the 'space.clientSecret' system property")

        val appInstance = SpaceAppInstance(resolvedClientId, resolvedClientSecret, spaceUrl)

        val spaceClient = SpaceClient(
            ktorClientForSpace(CIO),
            appInstance,
            SpaceAuth.ClientCredentials()
        )

        runBlocking {
            spaceClient.extractMessages(chatId, Path(path))
        }
    }

    companion object {
        // The chat id stops at the first '/', '?' or '#', so a copied link that carries a query
        // string or a trailing segment still yields a clean identifier.
        private val urlRegex =
            """(?<spaceUrl>https?:\/\/[^\/]*)\/im\/group\/(?<chatId>[^\/?#]+)(?:[\/?#].*)?""".toRegex()
    }
}
private class ExtractProjectCommand : ExtractCommand("project", "Extract all data from a project") {
val path: String by option(
ArgType.String,
@ -235,7 +272,7 @@ private class ExtractAllCommand : ExtractCommand("all", "Extract all data from a
// Command-line entry point: creates the "space-export" argument parser, registers the
// extraction subcommands and parses [args].
fun main(args: Array<String>) {
val parser = ArgParser("space-export")
// NOTE(review): the two registration lines below look like the removed and the added line of a
// diff hunk shown side by side (ExtractAllCommand vs. ExtractProjectCommand + ExtractChatsCommand)
// — confirm which one is live.
parser.subcommands(ExtractDocumentsCommand(), ExtractRepositoriesCommand(), ExtractAllCommand())
parser.subcommands(ExtractDocumentsCommand(), ExtractRepositoriesCommand(), ExtractProjectCommand(), ExtractChatsCommand())
parser.parse(args)
}