Add full chat extraction
This commit is contained in:
parent
c2a5ace4c4
commit
2734c18b11
@ -28,38 +28,33 @@ internal val logger by lazy { LoggerFactory.getLogger("space-extractor") }
|
|||||||
* Extract single attachment image
|
* Extract single attachment image
|
||||||
*/
|
*/
|
||||||
internal suspend fun SpaceClient.extractImage(
    imageFile: Path,
    imageId: String,
) {
    // Downloads the attachment image [imageId] from the `/d/` endpoint of the server and stores
    // it in [imageFile]. No directory is created here, only the file itself is written.
    logger.info("Downloading image file to $imageFile")

    // The request is authorised with the current access token of this client.
    val imageBytes = ktorClient.request {
        method = HttpMethod.Get
        url("${server.serverUrl}/d/$imageId")
        header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
    }.readBytes()

    // The whole response body is held in memory before it is written out.
    imageFile.writeBytes(imageBytes)
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract single file
|
* Extract single file
|
||||||
*/
|
*/
|
||||||
internal suspend fun SpaceClient.extractFile(
    documentFile: Path,
    documentId: String,
) {
    // Downloads the drive file [documentId] and streams its content into [documentFile].
    // Example of a download link: https://mipt-npm.jetbrains.space/drive/files/3qe9i43qtPq2
    logger.info("Downloading document file to $documentFile")

    val downloadUrl = "${server.serverUrl}/drive/files/$documentId"
    val fileResponse = ktorClient.request {
        method = HttpMethod.Get
        url(downloadUrl)
        header(HttpHeaders.Authorization, "Bearer ${token().accessToken}")
    }

    // Copy the response channel to the file; the stream is closed by `use` in any case.
    documentFile.outputStream().use { target ->
        fileResponse.bodyAsChannel().copyTo(target)
    }
}
|
||||||
@ -73,13 +68,16 @@ internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutine
|
|||||||
val documentBody = path.readText()
|
val documentBody = path.readText()
|
||||||
val logger = LoggerFactory.getLogger("space-document-extractor")
|
val logger = LoggerFactory.getLogger("space-document-extractor")
|
||||||
logger.info("Processing file $path...")
|
logger.info("Processing file $path...")
|
||||||
|
val imageDirectory = path.parent.resolve("images")
|
||||||
|
imageDirectory.createDirectories()
|
||||||
val newText = documentBody.replace(regex) {
|
val newText = documentBody.replace(regex) {
|
||||||
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
|
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
|
||||||
val alt = it.groups["alt"]?.value
|
val alt = it.groups["alt"]?.value
|
||||||
val imageName = alt?.let { "$id-$alt" } ?: id
|
val imageName = alt?.let { "$id-$alt" } ?: id
|
||||||
logger.info("Downloading image $id as images/$imageName")
|
val imageFile = imageDirectory.resolve(imageName)
|
||||||
|
logger.info("Downloading image $id as $imageFile")
|
||||||
launch(Dispatchers.IO) {
|
launch(Dispatchers.IO) {
|
||||||
extractImage(path.parent, id, imageName)
|
extractImage(imageFile, id)
|
||||||
}
|
}
|
||||||
"![$alt](images/$imageName"
|
"![$alt](images/$imageName"
|
||||||
}
|
}
|
||||||
@ -118,7 +116,7 @@ internal suspend fun SpaceClient.downloadDocument(
|
|||||||
when (val body = document.body) {
|
when (val body = document.body) {
|
||||||
is FileDocumentHttpBody -> {
|
is FileDocumentHttpBody -> {
|
||||||
launch(Dispatchers.IO) {
|
launch(Dispatchers.IO) {
|
||||||
extractFile(directory, document.id, document.title)
|
extractFile(directory.resolve(document.title), document.id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -129,7 +127,7 @@ internal suspend fun SpaceClient.downloadDocument(
|
|||||||
markdownFilePath.writeText(content.markdown, Charsets.UTF_8)
|
markdownFilePath.writeText(content.markdown, Charsets.UTF_8)
|
||||||
} else {
|
} else {
|
||||||
launch(Dispatchers.IO) {
|
launch(Dispatchers.IO) {
|
||||||
extractFile(directory, document.id, document.title)
|
extractFile(directory.resolve(document.title), document.id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -158,7 +156,6 @@ internal suspend fun SpaceClient.downloadDocumentFolder(
|
|||||||
val document = projects.documents.getDocument(projectId, it.id) {
|
val document = projects.documents.getDocument(projectId, it.id) {
|
||||||
id()
|
id()
|
||||||
title()
|
title()
|
||||||
// documentBody()
|
|
||||||
body()
|
body()
|
||||||
bodyType()
|
bodyType()
|
||||||
}
|
}
|
||||||
|
121
src/main/kotlin/extractMessages.kt
Normal file
121
src/main/kotlin/extractMessages.kt
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
package center.sciprog.space.documentextractor
|
||||||
|
|
||||||
|
import kotlinx.datetime.Clock
|
||||||
|
import kotlinx.datetime.Instant
|
||||||
|
import space.jetbrains.api.runtime.SpaceClient
|
||||||
|
import space.jetbrains.api.runtime.resources.chats
|
||||||
|
import space.jetbrains.api.runtime.types.*
|
||||||
|
import java.io.BufferedWriter
|
||||||
|
import java.nio.file.Path
|
||||||
|
import java.nio.file.StandardOpenOption
|
||||||
|
import kotlin.io.path.createDirectories
|
||||||
|
import kotlin.io.path.outputStream
|
||||||
|
|
||||||
|
/**
 * Download one attachment with [extractFile] and append a Markdown link to it.
 *
 * The local file name is `<attachmentId>-<attachmentName>`, or just the id when the attachment
 * has no name. The link target is relative (`attachments/<file name>`).
 */
private suspend fun SpaceClient.exportAttachment(
    attachmentsDirectory: Path,
    writer: BufferedWriter,
    attachmentId: String,
    attachmentName: String?,
) {
    val name = attachmentName?.let { "$attachmentId-$it" } ?: attachmentId
    extractFile(attachmentsDirectory.resolve(name), attachmentId)
    writer.appendLine("*Attachment*: [$name](attachments/$name)\n")
}

/**
 * Append every message of the channel [id] to [writer] as a Markdown list, newest message first.
 *
 * Messages are requested in batches. File, image and video attachments are downloaded into the
 * `attachments` sub-directory of [parentDirectory] and linked from the text. When a message has
 * a thread whose content is an [M2ChannelContentThread], the messages of that thread are written
 * right after the message by a recursive call.
 *
 * @param parentDirectory directory that receives the `attachments` sub-directory.
 * @param writer destination of the generated Markdown.
 * @param id channel or thread to read.
 * @param prefix string put in front of the header line and of every text line of a message.
 */
private suspend fun SpaceClient.writeMessages(
    parentDirectory: Path,
    writer: BufferedWriter,
    id: ChannelIdentifier,
    prefix: String = "",
) {
    // Single source of truth for the request size and for the "was the batch full" check below.
    val batchSize = 50

    // Loop-invariant, so resolved and created once instead of once per batch.
    val attachmentsDirectory = parentDirectory.resolve("attachments")
    attachmentsDirectory.createDirectories()

    var startFrom: Instant? = Clock.System.now()
    // Reading messages in batches, walking back in time from "now".
    do {
        val result: GetMessagesResponse = chats.messages.getChannelMessages(
            channel = id,
            sorting = MessagesSorting.FromNewestToOldest,
            startFromDate = startFrom,
            batchSize = batchSize,
        ) {
            nextStartFromDate()
            messages {
                author {
                    name()
                }
                text()
                created()
                attachments()
                thread {
                    content()
                }
            }
        }

        result.messages.forEach { message ->
            // The prefix is applied to each line explicitly. Margin-based re-indentation would
            // strip a leading `|` from message lines (Markdown tables) and would leave all but
            // the first text line without the prefix.
            writer.appendLine("$prefix* **(${message.created}) ${message.author.name}:**")
            message.text.trimIndent().lines().forEach { line ->
                writer.appendLine(prefix + line)
            }

            // NOTE(review): images and videos are fetched through extractFile, exactly like
            // plain files - confirm that this endpoint serves them.
            message.attachments?.forEach { info ->
                when (val attachment = info.details) {
                    is FileAttachment ->
                        exportAttachment(attachmentsDirectory, writer, attachment.id, attachment.filename)

                    is ImageAttachment ->
                        exportAttachment(attachmentsDirectory, writer, attachment.id, attachment.name)

                    is VideoAttachment ->
                        exportAttachment(attachmentsDirectory, writer, attachment.id, attachment.name)

                    // Other attachment kinds and missing details are not exported.
                    else -> Unit
                }
            }

            val threadContent = message.thread?.content
            if (threadContent is M2ChannelContentThread) {
                writeMessages(parentDirectory, writer, ChannelIdentifier.Thread(threadContent.record.id), " ")
            }
        }

        startFrom = result.nextStartFromDate
        // A short batch means the history is exhausted. The null check stops the loop when the
        // server gives no continuation date - presumably only at the end of the history; verify
        // against the API documentation.
    } while (result.messages.count() == batchSize && startFrom != null)
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Export the chat [chatId] into a Markdown file inside [parentDirectory].
 *
 * The file is called `<channel name>.md`, where the channel name is taken from the shared
 * channel content when the channel has one and from the default contact name otherwise.
 * An existing file with the same name is overwritten.
 */
suspend fun SpaceClient.extractMessages(
    chatId: String,
    parentDirectory: Path,
) {
    val channelId = ChannelIdentifier.Id(chatId)
    val contact = chats.channels.getChannel(channelId).contact

    val sharedChannel = contact.ext as? M2SharedChannelContent
    val channelName = sharedChannel?.name ?: contact.defaultName

    // NOTE(review): the channel name is used as a file name without any escaping - confirm
    // that names cannot contain path separators or other characters illegal in file names.
    val markdownFile = parentDirectory.resolve("$channelName.md")
    markdownFile.parent.createDirectories()

    val output = markdownFile.outputStream(
        StandardOpenOption.CREATE,
        StandardOpenOption.TRUNCATE_EXISTING,
        StandardOpenOption.WRITE,
    )
    output.bufferedWriter().use { out ->
        // Disabled channel header:
        // out.append(M2ChannelRecordStructure.serialize(channel).toPrettyString())
        // out.appendLine("</hr>")
        // out.appendLine()
        writeMessages(parentDirectory, out, channelId)
    }
}
|
@ -4,9 +4,10 @@ import space.jetbrains.api.runtime.SpaceClient
|
|||||||
import space.jetbrains.api.runtime.resources.projects
|
import space.jetbrains.api.runtime.resources.projects
|
||||||
import space.jetbrains.api.runtime.types.ProjectIdentifier
|
import space.jetbrains.api.runtime.types.ProjectIdentifier
|
||||||
import java.nio.file.Path
|
import java.nio.file.Path
|
||||||
import kotlin.io.path.createDirectories
|
|
||||||
import kotlin.io.path.div
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clone a single repository to a [parentDirectory]
|
||||||
|
*/
|
||||||
private fun cloneRepo(
|
private fun cloneRepo(
|
||||||
parentDirectory: Path,
|
parentDirectory: Path,
|
||||||
url: String,
|
url: String,
|
||||||
@ -25,8 +26,11 @@ private fun cloneRepo(
|
|||||||
// .call()
|
// .call()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract all repos in the project into a [parentDirectory]
|
||||||
|
*/
|
||||||
suspend fun SpaceClient.extractRepos(
|
suspend fun SpaceClient.extractRepos(
|
||||||
directory: Path,
|
parentDirectory: Path,
|
||||||
projectId: ProjectIdentifier,
|
projectId: ProjectIdentifier,
|
||||||
) {
|
) {
|
||||||
val repos = projects.getProject(
|
val repos = projects.getProject(
|
||||||
@ -41,7 +45,7 @@ suspend fun SpaceClient.extractRepos(
|
|||||||
project = projectId,
|
project = projectId,
|
||||||
repository = repo.name
|
repository = repo.name
|
||||||
).sshUrl ?: error("Could not resolve sshUrl for ${repo.name}")
|
).sshUrl ?: error("Could not resolve sshUrl for ${repo.name}")
|
||||||
cloneRepo(parentDirectory = directory, url)
|
cloneRepo(parentDirectory = parentDirectory, url)
|
||||||
} catch (ex: Exception) {
|
} catch (ex: Exception) {
|
||||||
logger.error("Failed ", ex)
|
logger.error("Failed ", ex)
|
||||||
}
|
}
|
||||||
|
@ -170,7 +170,44 @@ private class ExtractRepositoriesCommand : ExtractCommand("repos", "Extract repo
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private class ExtractAllCommand : ExtractCommand("all", "Extract all data from a project") {
|
/**
 * `chat` sub-command: exports all messages of one chat into a directory.
 *
 * The chat is identified by its url, which has to look like `<space url>/im/group/<chat id>`.
 * `url`, `clientId` and `clientSecret` are not declared here - presumably they are options of
 * [ExtractCommand]; verify against the base class.
 */
private class ExtractChatsCommand : ExtractCommand("chat", "Extract all messages from a chat") {

    /** Directory that receives the exported chat. */
    val path: String by option(
        ArgType.String,
        description = "Target directory. Default is './chats'."
    ).default("./chats")

    /**
     * Splits the url into the Space address and the chat id, creates a client authorised with
     * client credentials and blocks until the chat is exported.
     */
    override fun execute() {
        // The message names the chat pattern; it used to mention the document pattern.
        val urlMatch = urlRegex.matchEntire(url) ?: error("Url $url does not match space chat url pattern")

        val spaceUrl = urlMatch.groups["spaceUrl"]?.value ?: error("Space Url token not recognized")

        val chatId = urlMatch.groups["chatId"]?.value ?: error("Chat id token not recognized")

        // Credentials given on the command line win over the system properties.
        val appInstance = SpaceAppInstance(
            clientId ?: System.getProperty("space.clientId"),
            clientSecret ?: System.getProperty("space.clientSecret"),
            spaceUrl
        )

        val spaceClient = SpaceClient(
            ktorClientForSpace(CIO),
            appInstance,
            SpaceAuth.ClientCredentials()
        )

        runBlocking {
            spaceClient.extractMessages(chatId, Path(path))
        }
    }

    companion object {
        // Everything after `/im/group/` is taken as the chat id.
        private val urlRegex =
            """(?<spaceUrl>https?:\/\/[^\/]*)\/im\/group\/(?<chatId>.*)""".toRegex()
    }
}
|
||||||
|
|
||||||
|
private class ExtractProjectCommand : ExtractCommand("project", "Extract all data from a project") {
|
||||||
|
|
||||||
val path: String by option(
|
val path: String by option(
|
||||||
ArgType.String,
|
ArgType.String,
|
||||||
@ -235,7 +272,7 @@ private class ExtractAllCommand : ExtractCommand("all", "Extract all data from a
|
|||||||
/**
 * Entry point of the `space-export` tool: registers the sub-commands and parses [args].
 */
fun main(args: Array<String>) {
    ArgParser("space-export").apply {
        subcommands(
            ExtractDocumentsCommand(),
            ExtractRepositoriesCommand(),
            ExtractProjectCommand(),
            ExtractChatsCommand(),
        )
        parse(args)
    }
}
|
Loading…
Reference in New Issue
Block a user