commit e9439a8b9cde3877837cd0c6f2c6f8aace5a9dc0
Author: Alexander Nozik
Date:   Sun Dec 26 15:28:53 2021 +0300

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..cd3d2f4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.idea/
+.gradle/
+build/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4ec26ba
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+# Space Document Extractor
+
+The aim of this repository is to help generate stand-alone versions of JetBrains Space documents. Those documents are written in Markdown format and can include images. In order to do that, one has to perform several steps:
+
+* Download a page as Markdown to a directory.
+* Download attached images to a specific directory.
+* Replace references to attachments in Markdown files.
+
+This project uses the Space SDK to organize those steps.
+
+## Setting up Space Application
+
+In order to access data in Space, one needs to [create a Space Application](https://www.jetbrains.com/help/space/applications.html) and add the appropriate permissions. I am not sure which permissions cover access to images, but here are the ones that I allowed:
+
+* Provide external attachment unfurls
+* Provide external inline unfurls
+* View project data
+* View book metadata
+* View content
+
+Then one needs to copy the `clientId` and `clientSecret` of the application and use them as command line parameters.
+
+## Downloading texts
+
+Right now the Space SDK does not have methods to access documents, so the only way is to copy the Markdown and paste it directly into a file. I hope this will change in the future.
+
+## Download images
+
+The images in Space documents are inserted in the following format: `![](/d/aaaabbbbcccc?f=0 "name.png")`. Our aim is to detect those links in files and download the corresponding images. Those links cannot be replaced directly, because access requires OAuth authentication. For that we need to use an access token from the Space SDK.
+
+## Replace references
+
+After file is successfully downloaded, the reference in file must be replaced with a local one.
+
+## Command line interface
+
+Typical application usage:
+
+```commandline
+.\space-document-extractor --spaceUrl https://mipt-npm.jetbrains.space --path D:\Work\report\ --clientId "your client ID" --clientSecret "your client secret"
+```
+
+It will search the directory (and subdirectories) and replace image links with downloaded image in `./images` directory.
\ No newline at end of file
diff --git a/build.gradle.kts b/build.gradle.kts
new file mode 100644
index 0000000..00a1f37
--- /dev/null
+++ b/build.gradle.kts
@@ -0,0 +1,39 @@
+import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
+
+plugins {
+    kotlin("jvm") version "1.6.10"
+    application
+}
+
+group = "ru.mipt.npm"
+version = "1.0-SNAPSHOT"
+
+repositories {
+    mavenCentral()
+    maven("https://maven.pkg.jetbrains.space/public/p/space/maven")
+}
+
+val ktorVersion = "1.6.4"
+
+dependencies {
+    implementation("io.ktor:ktor-client-core:$ktorVersion")
+    implementation("io.ktor:ktor-client-cio:$ktorVersion")
+    implementation("io.ktor:ktor-client-auth:$ktorVersion")
+    implementation("org.jetbrains.kotlinx:kotlinx-cli:0.3.4")
+    implementation("org.jetbrains:space-sdk-jvm:86641-beta")
+    implementation("ch.qos.logback:logback-classic:1.2.10")
+    testImplementation(kotlin("test"))
+}
+
+tasks.test {
+    useJUnitPlatform()
+}
+
+// Applies to every Kotlin compilation task: emit bytecode for JVM 11.
+tasks.withType<KotlinCompile> {
+    kotlinOptions.jvmTarget = "11"
+}
+
+application {
+    mainClass.set("ru.mipt.npm.space.documentextractor.MainKt")
+}
\ No newline at end of file
diff --git a/gradle.properties b/gradle.properties
new file mode 100644
index 0000000..7fc6f1f
--- /dev/null
+++ b/gradle.properties
@@ -0,0 +1 @@
+kotlin.code.style=official
diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000..7454180
Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ
diff --git
a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000..d2880ba --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,5 @@ +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.2-bin.zip +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew new file mode 100644 index 0000000..744e882 --- /dev/null +++ b/gradlew @@ -0,0 +1,185 @@ +#!/usr/bin/env sh + +# +# Copyright 2015 the original author or authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MSYS* | MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? 
-ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin or MSYS, switch paths to Windows format before running java +if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=`expr $i + 1` + done + case $i in + 0) set -- ;; + 1) set -- "$args0" ;; + 2) set -- "$args0" "$args1" ;; + 3) set -- "$args0" "$args1" "$args2" ;; + 4) set -- "$args0" "$args1" "$args2" "$args3" ;; + 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + 9) set -- "$args0" "$args1" "$args2" "$args3" 
"$args4" "$args5" "$args6" "$args7" "$args8" ;; + esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=`save "$@"` + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100644 index 0000000..107acd3 --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,89 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. +@rem + +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto execute + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! 
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/settings.gradle.kts b/settings.gradle.kts
new file mode 100644
index 0000000..d10b0f9
--- /dev/null
+++ b/settings.gradle.kts
@@ -0,0 +1,3 @@
+
+rootProject.name = "space-document-extractor"
+
diff --git a/src/main/kotlin/main.kt b/src/main/kotlin/main.kt
new file mode 100644
index 0000000..81055b7
--- /dev/null
+++ b/src/main/kotlin/main.kt
@@ -0,0 +1,72 @@
+package ru.mipt.npm.space.documentextractor
+
+import io.ktor.client.HttpClient
+import io.ktor.client.engine.cio.CIO
+import kotlinx.cli.ArgParser
+import kotlinx.cli.ArgType
+import kotlinx.cli.required
+import space.jetbrains.api.runtime.SpaceHttpClient
+import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext
+import space.jetbrains.api.runtime.withServiceAccountTokenSource
+import java.nio.file.Path
+import kotlin.io.path.exists
+import kotlin.io.path.isDirectory
+
+/**
+ * Looks up a credential that was not passed on the command line.
+ *
+ * The JVM system property [key] is checked first, then the environment variable of the same name.
+ * Fails with a readable message instead of passing a missing value on to the Space client.
+ */
+private fun credentialFallback(key: String): String =
+    System.getProperty(key)
+        ?: System.getenv(key)
+        ?: error("'$key' is neither passed as an option nor set as a system property or environment variable")
+
+/**
+ * Command line entry point: downloads the images referenced from a Space Markdown document
+ * (or from every document of a directory) and rewrites the references to local files.
+ */
+suspend fun main(args: Array<String>) {
+    val parser = ArgParser("space-document-extractor")
+    val path by parser.option(ArgType.String, description = "Input file or directory").required()
+
+    val spaceUrl by parser.option(
+        ArgType.String,
+        description = "Url of the space instance like 'https://mipt-npm.jetbrains.space'"
+    ).required()
+
+    val clientId by parser.option(
+        ArgType.String,
+        description = "Space application client ID (if not defined, use environment value 'space.clientId')"
+    )
+
+    val clientSecret by parser.option(
+        ArgType.String,
+        description = "Space application client secret (if not defined, use environment value 'space.clientSecret')"
+    )
+
+    parser.parse(args)
+
+    val pathValue: Path = Path.of(path)
+
+    if (!pathValue.exists()) {
+        error("File or directory not found at $path")
+    }
+
+    // `use` closes the HTTP client even when processing fails.
+    HttpClient(CIO).use { client ->
+        val space: SpaceHttpClientWithCallContext = SpaceHttpClient(client).withServiceAccountTokenSource(
+            clientId = clientId ?: credentialFallback("space.clientId"),
+            clientSecret = clientSecret ?: credentialFallback("space.clientSecret"),
+            // Authenticate against the instance given on the command line, not a hard-coded one.
+            serverUrl = spaceUrl
+        )
+
+        if (pathValue.isDirectory()) {
+            space.processDirectory(client, spaceUrl, pathValue)
+        } else {
+            space.processDocument(client, spaceUrl, pathValue)
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/kotlin/process.kt b/src/main/kotlin/process.kt
new file mode 100644
index 0000000..a37bfb0
--- /dev/null
+++ b/src/main/kotlin/process.kt
@@ -0,0 +1,114 @@
+package ru.mipt.npm.space.documentextractor
+
+import io.ktor.client.HttpClient
+import io.ktor.client.request.HttpRequestBuilder
+import io.ktor.client.request.header
+import io.ktor.client.request.request
+import io.ktor.client.request.url
+import io.ktor.client.statement.HttpResponse
+import io.ktor.client.statement.readBytes
+import io.ktor.http.HttpHeaders
+import io.ktor.http.HttpMethod
+import kotlinx.coroutines.coroutineScope
+import kotlinx.coroutines.launch
+import org.slf4j.LoggerFactory
+import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext
+import java.nio.file.Files
+import java.nio.file.Path
+import kotlin.io.path.isDirectory
+import kotlin.io.path.readText
+import kotlin.io.path.writeBytes
+import kotlin.io.path.writeText
+import kotlin.streams.toList
+
+private val logger = LoggerFactory.getLogger("space-document-extractor")
+
+/**
+ * Downloads a single attachment from Space and stores it as `images/<imageFileName>` under [parent].
+ *
+ * The request is authorized with a bearer token taken from this client's token source.
+ * The `images` directory is created on demand.
+ *
+ * @param client HTTP client used for the download
+ * @param spaceUrl base URL of the Space instance; a trailing slash is tolerated
+ * @param parent directory of the document that references the image
+ * @param imageId attachment identifier, i.e. the part of the link that follows `/d/`
+ * @param imageFileName name of the file to create inside `images`
+ */
+suspend fun SpaceHttpClientWithCallContext.extractImage(
+    client: HttpClient,
+    spaceUrl: String,
+    parent: Path,
+    imageId: String,
+    imageFileName: String,
+) {
+    val request = HttpRequestBuilder().apply {
+        val token = callContext.tokenSource.token()
+        url("${spaceUrl.trimEnd('/')}/d/$imageId")
+        method = HttpMethod.Get
+        header(HttpHeaders.Authorization, "Bearer ${token.accessToken}")
+    }
+    val response = client.request<HttpResponse>(request)
+    val file = parent.resolve("images").resolve(imageFileName)
+    // `writeBytes` does not create missing directories, so make sure `images` exists first.
+    Files.createDirectories(file.parent)
+    file.writeBytes(response.readBytes())
+}
+
+/**
+ * Matches the head of a Space image reference, `![alt](/d/<id>?f=0`.
+ *
+ * Negated character classes keep every match inside one reference even when
+ * several images share a line (a greedy `.*` would merge them into a single match).
+ */
+private val regex = """!\[(?<fileName>[^\]]*)]\(/d/(?<id>[^?)\s]+)\?f=0""".toRegex()
+
+/**
+ * Downloads every Space image referenced from the Markdown document at [path] and
+ * rewrites the references to point at the `images` directory next to the document.
+ *
+ * An image is stored under the alt text of its reference, or under its id when the alt text is blank.
+ * The document is rewritten only after all downloads have succeeded.
+ */
+suspend fun SpaceHttpClientWithCallContext.processDocument(client: HttpClient, spaceUrl: String, path: Path) {
+    logger.info("Processing file $path...")
+    val documentBody = path.readText()
+    // A bare relative name such as `doc.md` has no parent, so make the path absolute first.
+    val parent = path.toAbsolutePath().parent ?: error("Can't resolve the directory of $path")
+    // `coroutineScope` returns only after all launched downloads complete and rethrows their failures.
+    val newText = coroutineScope {
+        documentBody.replace(regex) { match ->
+            val id = match.groups["id"]?.value ?: error("Unexpected reference format: ${match.value}")
+            val fileName = match.groups["fileName"]?.value?.takeIf { it.isNotBlank() } ?: id
+            launch {
+                logger.info("Downloading image $id")
+                extractImage(client, spaceUrl, parent, id, fileName)
+            }
+            "![](images/$fileName"
+        }
+    }
+    path.writeText(newText)
+}
+
+/**
+ * Processes every non-directory entry of [path] whose name ends with [fileExtension].
+ *
+ * @param recursive whether to descend into subdirectories
+ */
+suspend fun SpaceHttpClientWithCallContext.processDirectory(
+    client: HttpClient,
+    spaceUrl: String,
+    path: Path,
+    fileExtension: String = ".md",
+    recursive: Boolean = true,
+) {
+    // `Files.list` keeps the directory open until the stream is closed.
+    val entries = Files.list(path).use { it.toList() }
+    for (entry in entries) {
+        if (entry.isDirectory()) {
+            if (recursive) processDirectory(client, spaceUrl, entry, fileExtension, recursive)
+        } else if (entry.toString().endsWith(fileExtension)) {
+            processDocument(client, spaceUrl, entry)
+        }
+    }
+}
\ No newline at end of file