diff --git a/src/main/kotlin/extractDocuments.kt b/src/main/kotlin/extractDocuments.kt index 0c16530..cb496f8 100644 --- a/src/main/kotlin/extractDocuments.kt +++ b/src/main/kotlin/extractDocuments.kt @@ -28,38 +28,33 @@ internal val logger by lazy { LoggerFactory.getLogger("space-extractor") } * Extract single attachment image */ internal suspend fun SpaceClient.extractImage( - parent: Path, + imageFile: Path, imageId: String, - imageFileName: String, ) { - logger.info("Downloading image file $imageFileName to $parent") + logger.info("Downloading image file to $imageFile") val response = ktorClient.request { url("${server.serverUrl}/d/$imageId") method = HttpMethod.Get header(HttpHeaders.Authorization, "Bearer ${token().accessToken}") } - val file = parent.resolve("images/$imageFileName") - file.parent.createDirectories() - file.writeBytes(response.readBytes()) + imageFile.writeBytes(response.readBytes()) } /** * Extract single file */ internal suspend fun SpaceClient.extractFile( - parent: Path, + documentFile: Path, documentId: String, - documentFileName: String, ) { //https://mipt-npm.jetbrains.space/drive/files/3qe9i43qtPq2 - logger.info("Downloading document file $documentFileName to $parent") + logger.info("Downloading document file to $documentFile") val response = ktorClient.request { url("${server.serverUrl}/drive/files/$documentId") method = HttpMethod.Get header(HttpHeaders.Authorization, "Bearer ${token().accessToken}") } - val file = parent.resolve(documentFileName) - file.outputStream().use { + documentFile.outputStream().use { response.bodyAsChannel().copyTo(it) } } @@ -73,13 +68,16 @@ internal suspend fun SpaceClient.processMarkdownDocument(path: Path) = coroutine val documentBody = path.readText() val logger = LoggerFactory.getLogger("space-document-extractor") logger.info("Processing file $path...") + val imageDirectory = path.parent.resolve("images") + imageDirectory.createDirectories() val newText = documentBody.replace(regex) { val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}") val alt = it.groups["alt"]?.value val imageName = alt?.let { "$id-$alt" } ?: id - logger.info("Downloading image $id as images/$imageName") + val imageFile = imageDirectory.resolve(imageName) + logger.info("Downloading image $id as $imageFile") launch(Dispatchers.IO) { - extractImage(path.parent, id, imageName) + extractImage(imageFile, id) } "![$alt](images/$imageName" } @@ -118,7 +116,7 @@ internal suspend fun SpaceClient.downloadDocument( when (val body = document.body) { is FileDocumentHttpBody -> { launch(Dispatchers.IO) { - extractFile(directory, document.id, document.title) + extractFile(directory.resolve(document.title), document.id) } } @@ -129,7 +127,7 @@ internal suspend fun SpaceClient.downloadDocument( markdownFilePath.writeText(content.markdown, Charsets.UTF_8) } else { launch(Dispatchers.IO) { - extractFile(directory, document.id, document.title) + extractFile(directory.resolve(document.title), document.id) } } } @@ -158,7 +156,6 @@ internal suspend fun SpaceClient.downloadDocumentFolder( val document = projects.documents.getDocument(projectId, it.id) { id() title() -// documentBody() body() bodyType() } diff --git a/src/main/kotlin/extractMessages.kt b/src/main/kotlin/extractMessages.kt new file mode 100644 index 0000000..2c7977c --- /dev/null +++ b/src/main/kotlin/extractMessages.kt @@ -0,0 +1,121 @@ +package center.sciprog.space.documentextractor + +import kotlinx.datetime.Clock +import kotlinx.datetime.Instant +import space.jetbrains.api.runtime.SpaceClient +import space.jetbrains.api.runtime.resources.chats +import space.jetbrains.api.runtime.types.* +import java.io.BufferedWriter +import java.nio.file.Path +import java.nio.file.StandardOpenOption +import kotlin.io.path.createDirectories +import kotlin.io.path.outputStream + +private suspend fun SpaceClient.writeMessages( + parentDirectory: Path, + writer: BufferedWriter, + id: ChannelIdentifier, + prefix: String = "", +) { + var readDateTime: Instant? = Clock.System.now() + var read: Int + //reading messages in batches + do { + val result: GetMessagesResponse = chats.messages.getChannelMessages( + channel = id, + sorting = MessagesSorting.FromNewestToOldest, + startFromDate = readDateTime, + batchSize = 50 + ) { + nextStartFromDate() + messages { + author { + name() + } + text() + created() + attachments() + thread { + content() + } + } + } + val attachmentsDirectory = parentDirectory.resolve("attachments") + attachmentsDirectory.createDirectories() + + result.messages.forEach { message -> + writer.appendLine( + """ + |* **(${message.created}) ${message.author.name}:** + |${message.text.trimIndent()} + """.replaceIndentByMargin(prefix) + ) + + message.attachments?.map { it.details }?.let { attachments -> + attachments.forEach { attachment: Attachment? -> + when (attachment) { + is FileAttachment -> { + val fileId = attachment.id + val name = "${attachment.id}-${attachment.filename}" + val file = attachmentsDirectory.resolve(name) + extractFile(file, fileId) + writer.appendLine("*Attachment*: [name](attachments/$name)\n") + } + + is ImageAttachment -> { + val fileId = attachment.id + val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId + val file = attachmentsDirectory.resolve(name) + extractFile(file, fileId) + writer.appendLine("*Attachment*: [$name](attachments/$name)\n") + } + + is VideoAttachment -> { + val fileId = attachment.id + val name = attachment.name?.let { "${attachment.id}-${attachment.name}"} ?: fileId + val file = attachmentsDirectory.resolve(name) + extractFile(file, fileId) + writer.appendLine("*Attachment*: [$name](attachments/$name)\n") + } + } + } + } + + message.thread?.content?.let { + if (it is M2ChannelContentThread) { + writeMessages(parentDirectory, writer, ChannelIdentifier.Thread(it.record.id), " ") + } + } + } + + read = result.messages.count() + readDateTime = result.nextStartFromDate + + } while (read == 50) +} + + +suspend fun SpaceClient.extractMessages( + chatId: String, + parentDirectory: Path, +) { + val id = ChannelIdentifier.Id(chatId) + val channel = chats.channels.getChannel(id) + + val name = (channel.contact.ext as? M2SharedChannelContent)?.name ?: channel.contact.defaultName + + val file = parentDirectory.resolve("$name.md") + + file.parent.createDirectories() + + file.outputStream( + StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING, + StandardOpenOption.WRITE + ).bufferedWriter().use { out -> +// out.append(M2ChannelRecordStructure.serialize(channel).toPrettyString()) +// out.appendLine("") +// out.appendLine() + writeMessages(parentDirectory, out, id) + } +} \ No newline at end of file diff --git a/src/main/kotlin/extractRepos.kt b/src/main/kotlin/extractRepos.kt index f114d2b..bf0496d 100644 --- a/src/main/kotlin/extractRepos.kt +++ b/src/main/kotlin/extractRepos.kt @@ -4,9 +4,10 @@ import space.jetbrains.api.runtime.SpaceClient import space.jetbrains.api.runtime.resources.projects import space.jetbrains.api.runtime.types.ProjectIdentifier import java.nio.file.Path -import kotlin.io.path.createDirectories -import kotlin.io.path.div +/** + * Clone a single repository to a [parentDirectory] + */ private fun cloneRepo( parentDirectory: Path, url: String, @@ -25,8 +26,11 @@ private fun cloneRepo( // .call() } +/** + * Extract all repos in the project into a [parentDirectory] + */ suspend fun SpaceClient.extractRepos( - directory: Path, + parentDirectory: Path, projectId: ProjectIdentifier, ) { val repos = projects.getProject( @@ -41,7 +45,7 @@ suspend fun SpaceClient.extractRepos( project = projectId, repository = repo.name ).sshUrl ?: error("Could not resolve sshUrl for ${repo.name}") - cloneRepo(parentDirectory = directory, url) + cloneRepo(parentDirectory = parentDirectory, url) } catch (ex: Exception) { logger.error("Failed ", ex) } diff --git a/src/main/kotlin/main.kt b/src/main/kotlin/main.kt index ad41ac2..a281f96 100644 --- a/src/main/kotlin/main.kt +++ b/src/main/kotlin/main.kt @@ -170,7 +170,44 @@ private class ExtractRepositoriesCommand : ExtractCommand("repos", "Extract repo } -private class ExtractAllCommand : ExtractCommand("all", "Extract all data from a project") { +private class ExtractChatsCommand: ExtractCommand("chat","Extract all messages from a chat"){ + + val path: String by option( + ArgType.String, + description = "Target directory. Default is './chats'." + ).default("./chats") + + override fun execute() { + val urlMatch = urlRegex.matchEntire(url) ?: error("Url $url does not match space document url pattern") + + val spaceUrl = urlMatch.groups["spaceUrl"]?.value ?: error("Space Url token not recognized") + + val chatId = urlMatch.groups["chatId"]?.value ?: error("Chat id token not recognized") + + val appInstance = SpaceAppInstance( + clientId ?: System.getProperty("space.clientId"), + clientSecret ?: System.getProperty("space.clientSecret"), + spaceUrl + ) + + val spaceClient: SpaceClient = SpaceClient( + ktorClientForSpace(CIO), + appInstance, + SpaceAuth.ClientCredentials() + ) + + runBlocking { + spaceClient.extractMessages(chatId, Path(path)) + } + } + + companion object { + private val urlRegex = + """(?https?:\/\/[^\/]*)\/im\/group\/(?.*)""".toRegex() + } +} + +private class ExtractProjectCommand : ExtractCommand("project", "Extract all data from a project") { val path: String by option( ArgType.String, @@ -235,7 +272,7 @@ private class ExtractAllCommand : ExtractCommand("all", "Extract all data from a fun main(args: Array) { val parser = ArgParser("space-export") - parser.subcommands(ExtractDocumentsCommand(), ExtractRepositoriesCommand(), ExtractAllCommand()) + parser.subcommands(ExtractDocumentsCommand(), ExtractRepositoriesCommand(), ExtractProjectCommand(), ExtractChatsCommand()) parser.parse(args) } \ No newline at end of file