initial commit

This commit is contained in:
Alexander Nozik 2021-12-26 15:28:53 +03:00
commit e9439a8b9c
No known key found for this signature in database
GPG Key ID: F7FCF2DD25C71357
11 changed files with 499 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
.idea/
.gradle/
build/

43
README.md Normal file
View File

@ -0,0 +1,43 @@
# Space Document Extractor
The aim of this repository is to help to generate stand-alone version of JetBrains Space documents. Those documents are written in MarkDown format and could include images. In order to do that one have to do several steps:
* Download a page as markdown to a directory.
* Download attached images to specific directory.
* Replace references to attachments in MarkDown files.
This project uses Space SDK to organize those steps.
## Setting up Space Application
In order to access data in Space, one needs to [create a Space Application](https://www.jetbrains.com/help/space/applications.html) and add appropriate permissions. I am not sure which permissions cover access to images, but here are those that I allowed:
* Provide external attachment unfurls
* Provide external inline unfurls
* View project data
* View book metadata
* View content
Then one needs to copy `clientId` and `clientSecret` for the application and use them as command line parameters.
## Downloading texts
Right now Space SDK does not have methods to access documents, so the only way is to copy the markdown and paste it directly to a file. I hope it will change in the future.
## Download images
The images in space documents are inserted in the following format: `![](/d/aaaabbbbcccc?f=0 "name.png")`. Our aim is to detect those links in files and download appropriate images. Those links could not be replaced directly, because access requires OAuth authentication. For that we need to use access token from Space SDK.
## Replace references
After file is successfully downloaded, the reference in file must be replaced with a local one.
## Command line interface
Typical application usage:
```commandline
.\space-document-extractor --spaceUrl https://mipt-npm.jetbrains.space --path D:\Work\report\ --clientId "your client ID" --clientSecret "your client secret"
```
It will search the directory (and subdirectories) and replace image links with downloaded image in `./images` directory.

38
build.gradle.kts Normal file
View File

@ -0,0 +1,38 @@
import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
plugins {
kotlin("jvm") version "1.6.10"
application
}
group = "ru.mipt.npm"
version = "1.0-SNAPSHOT"
repositories {
mavenCentral()
maven("https://maven.pkg.jetbrains.space/public/p/space/maven")
}
val ktorVersion = "1.6.4"
dependencies {
implementation("io.ktor:ktor-client-core:$ktorVersion")
implementation("io.ktor:ktor-client-cio:$ktorVersion")
implementation("io.ktor:ktor-client-auth:$ktorVersion")
implementation("org.jetbrains.kotlinx:kotlinx-cli:0.3.4")
implementation("org.jetbrains:space-sdk-jvm:86641-beta")
implementation("ch.qos.logback:logback-classic:1.2.10")
testImplementation(kotlin("test"))
}
tasks.test {
useJUnitPlatform()
}
tasks.withType<KotlinCompile> {
kotlinOptions.jvmTarget = "11"
}
application {
mainClass.set("ru.mipt.npm.space.documentextractor.MainKt")
}

1
gradle.properties Normal file
View File

@ -0,0 +1 @@
kotlin.code.style=official

BIN
gradle/wrapper/gradle-wrapper.jar vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,5 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.2-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

185
gradlew vendored Normal file
View File

@ -0,0 +1,185 @@
#!/usr/bin/env sh
#
# Copyright 2015 the original author or authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
warn () {
echo "$*"
}
die () {
echo
echo "$*"
echo
exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MSYS* | MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=`expr $i + 1`
done
case $i in
0) set -- ;;
1) set -- "$args0" ;;
2) set -- "$args0" "$args1" ;;
3) set -- "$args0" "$args1" "$args2" ;;
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Escape application args
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
APP_ARGS=`save "$@"`
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
exec "$JAVACMD" "$@"

89
gradlew.bat vendored Normal file
View File

@ -0,0 +1,89 @@
@rem
@rem Copyright 2015 the original author or authors.
@rem
@rem Licensed under the Apache License, Version 2.0 (the "License");
@rem you may not use this file except in compliance with the License.
@rem You may obtain a copy of the License at
@rem
@rem https://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Resolve any "." and ".." in APP_HOME to make it shorter.
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto execute
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto execute
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega

3
settings.gradle.kts Normal file
View File

@ -0,0 +1,3 @@
rootProject.name = "space-document-extractor"

56
src/main/kotlin/main.kt Normal file
View File

@ -0,0 +1,56 @@
package ru.mipt.npm.space.documentextractor
import io.ktor.client.HttpClient
import io.ktor.client.engine.cio.CIO
import kotlinx.cli.ArgParser
import kotlinx.cli.ArgType
import kotlinx.cli.required
import space.jetbrains.api.runtime.SpaceHttpClient
import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext
import space.jetbrains.api.runtime.withServiceAccountTokenSource
import java.nio.file.Path
import kotlin.io.path.exists
import kotlin.io.path.isDirectory
suspend fun main(args: Array<String>) {
val parser = ArgParser("space-document-extractor")
val path by parser.option(ArgType.String, description = "Input file or directory").required()
val spaceUrl by parser.option(
ArgType.String,
description = "Url of the space instance like 'https://mipt-npm.jetbrains.space'"
).required()
val clientId by parser.option(
ArgType.String,
description = "Space application client ID (if not defined, use environment value 'space.clientId')"
)
val clientSecret by parser.option(
ArgType.String,
description = "Space application client secret (if not defined, use environment value 'space.clientSecret')"
)
parser.parse(args)
val pathValue: Path = Path.of(path)
if (!pathValue.exists()) {
error("File or directory not found at $path")
}
val client = HttpClient(CIO)
val space: SpaceHttpClientWithCallContext = SpaceHttpClient(client).withServiceAccountTokenSource(
clientId = clientId ?: System.getProperty("space.clientId"),
clientSecret = clientSecret ?: System.getProperty("space.clientSecret"),
serverUrl = "https://mipt-npm.jetbrains.space"
)
if (pathValue.isDirectory()) {
space.processDirectory(client, spaceUrl, pathValue)
} else {
space.processDocument(client, spaceUrl, pathValue)
}
}

View File

@ -0,0 +1,76 @@
package ru.mipt.npm.space.documentextractor
import io.ktor.client.HttpClient
import io.ktor.client.request.HttpRequestBuilder
import io.ktor.client.request.header
import io.ktor.client.request.request
import io.ktor.client.request.url
import io.ktor.client.statement.HttpResponse
import io.ktor.client.statement.readBytes
import io.ktor.http.HttpHeaders
import io.ktor.http.HttpMethod
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.launch
import org.slf4j.LoggerFactory
import space.jetbrains.api.runtime.SpaceHttpClientWithCallContext
import java.nio.file.Files
import java.nio.file.Path
import kotlin.io.path.isDirectory
import kotlin.io.path.readText
import kotlin.io.path.writeBytes
import kotlin.io.path.writeText
import kotlin.streams.toList
suspend fun SpaceHttpClientWithCallContext.extractImage(
client: HttpClient,
spaceUrl: String,
parent: Path,
imageId: String,
imageFileName: String,
) {
val request = HttpRequestBuilder().apply {
val token = callContext.tokenSource.token()
url("$spaceUrl/d/$imageId")
method = HttpMethod.Get
header(HttpHeaders.Authorization, "Bearer ${token.accessToken}")
}
val response = client.request<HttpResponse>(request)
val file = parent.resolve("images/$imageFileName")
file.writeBytes(response.readBytes())
}
private val regex = """!\[(?<fileName>.*)]\(/d/(?<id>.*)\?f=0""".toRegex()
suspend fun SpaceHttpClientWithCallContext.processDocument(client: HttpClient, spaceUrl: String, path: Path) {
val documentBody = path.readText()
val logger = LoggerFactory.getLogger("space-document-extractor")
logger.info("Processing file $path...")
coroutineScope {
val newText = documentBody.replace(regex) {
val id = it.groups["id"]?.value ?: error("Unexpected reference format: ${it.value}")
val fileName = it.groups["fileName"]?.value ?: id
launch {
logger.info("Downloading image $id")
extractImage(client, spaceUrl, path.parent, id, fileName)
}
"![](images/$fileName"
}
path.writeText(newText)
}
}
suspend fun SpaceHttpClientWithCallContext.processDirectory(
client: HttpClient,
spaceUrl: String,
path: Path,
fileExtension: String = ".md",
recursive: Boolean = true,
) {
Files.list(path).toList().forEach {
if (it.toString().endsWith(fileExtension)) {
processDocument(client, spaceUrl, it)
} else if (recursive && it.isDirectory()) {
processDirectory(client, spaceUrl, it, fileExtension)
}
}
}