Fully working document processing

This commit is contained in:
Alexander Nozik 2024-04-30 19:00:37 +03:00
parent f70f1417a8
commit 7a2b5c1768
16 changed files with 291 additions and 95 deletions

View File

@ -1,9 +1,8 @@
---
type: markdown
order: 1
contentType: markdown
---
# ${documentName}
Document name: ${documentName}
${documentMeta.metaValue}

View File

@ -1,6 +1,5 @@
---
type: markdown
order: 3
contentType: markdown
---
## Chapter ${section(1)}

View File

@ -0,0 +1,14 @@
route: lorem.ipsum
title: Lorem Ipsum
authors:
- name: Alexander Nozik
affiliation: MIPT
fragments:
- name: chapter1
type: data
- name: chapter2
type: data
- name: chapter3
type: data
documentMeta:
metaValue: Hello world!

View File

@ -1,24 +1,42 @@
import io.ktor.server.application.Application
import io.ktor.server.cio.CIO
import io.ktor.server.engine.embeddedServer
import space.kscience.dataforge.meta.Meta
import space.kscience.dataforge.names.asName
import space.kscience.snark.html.document.document
import space.kscience.snark.html.document.fragment
import kotlinx.html.ScriptCrossorigin
import kotlinx.html.link
import kotlinx.html.script
import space.kscience.snark.html.document.allDocuments
@Suppress("unused")
fun Application.documents() = snarkApplication {
document("loremIpsum".asName(), Meta { "metaValue" put "Hello world!" }) {
fragment("chapter1")
fragment("chapter2")
fragment("chapter3")
}
/**
 * Ktor application module that builds a Snark site and registers all documents via [allDocuments].
 *
 * The `headers` block passed to [allDocuments] adds the KaTeX 0.16.10 stylesheet, the KaTeX script and the
 * auto-render extension (all loaded from the jsDelivr CDN with integrity hashes) to the head of each document.
 */
fun Application.renderAllDocuments() = snarkApplication {
allDocuments(
headers = {
// KaTeX stylesheet
link {
rel = "stylesheet"
href = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.css"
attributes["integrity"] = "sha384-wcIxkf4k558AjM3Yz3BBFQUbk/zgIYC2R0QpeeYb+TwlBVMrlgLqwRjRtGZiK7ww"
attributes["crossorigin"] = "anonymous"
}
// KaTeX core script, deferred
script {
defer = true
src = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.js"
integrity = "sha384-hIoBPJpTUs74ddyc4bFZSM1TVlQDA60VBbJS0oA934VSz82sBx1X7kSx2ATBDIyd"
crossorigin = ScriptCrossorigin.anonymous
}
// KaTeX auto-render extension; once loaded it renders math found anywhere in the document body
script {
defer = true
src = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/contrib/auto-render.min.js"
integrity = "sha384-43gviWU0YVjaDtb/GhzOouOXtZMP/7XUzwPTstBeZFe/+rCMvRwr4yROQP43s0Xk"
crossorigin = ScriptCrossorigin.anonymous
attributes["onload"] = "renderMathInElement(document.body);"
}
}
)
}
fun main() {
embeddedServer(CIO) {
documents()
renderAllDocuments()
}.start(true)
}

View File

@ -1,5 +1,6 @@
import io.ktor.server.application.Application
import io.ktor.server.application.log
import io.ktor.server.http.content.staticResources
import io.ktor.server.routing.routing
import space.kscience.dataforge.context.Context
import space.kscience.dataforge.context.ContextBuilder
@ -42,6 +43,7 @@ fun Application.snarkApplication(contextBuilder: ContextBuilder.() -> Unit = {},
}
routing {
staticResources("/css","css")
site(context, siteData, content = site)
}
}

View File

@ -15,12 +15,12 @@ import kotlin.reflect.typeOf
public class ReWrapAction<R : Any>(
type: KType,
private val newMeta: MutableMeta.(name: Name) -> Unit = {},
private val newName: (name: Name, meta: Meta?) -> Name,
private val newName: (name: Name, meta: Meta?, type: KType) -> Name,
) : AbstractAction<R, R>(type) {
override fun DataSink<R>.generate(data: DataTree<R>, meta: Meta) {
data.forEach { namedData ->
put(
newName(namedData.name, namedData.meta),
newName(namedData.name, namedData.meta, namedData.type),
namedData.data.withMeta(namedData.meta.copy { newMeta(namedData.name) })
)
}
@ -28,34 +28,19 @@ public class ReWrapAction<R : Any>(
override fun DataSink<R>.update(source: DataTree<R>, meta: Meta, namedData: NamedData<R>) {
put(
newName(namedData.name, namedData.meta),
newName(namedData.name, namedData.meta, namedData.type),
namedData.withMeta(namedData.meta.copy { newMeta(namedData.name) })
)
}
public companion object {
public inline fun <reified R : Any> removeExtensions(
vararg bypassExtensions: String,
noinline newMeta: MutableMeta.(name: Name) -> Unit = {},
): ReWrapAction<R> = ReWrapAction(typeOf<R>(), newMeta = newMeta) { name, _ ->
name.replaceLast { token ->
val extension = token.body.substringAfterLast('.')
if (extension in bypassExtensions) {
NameToken(token.body.removeSuffix(".$extension"))
} else {
token
}
}
}
public inline fun <reified R : Any> removeIndex(): ReWrapAction<R> = ReWrapAction<R>(typeOf<R>()) { name, _ ->
if (name.endsWith("index")) name.cutLast() else name
}
}
}
public inline fun <reified R : Any> ReWrapAction(
noinline newMeta: MutableMeta.(name: Name) -> Unit = {},
noinline newName: (Name, Meta?) -> Name,
noinline newName: (Name, Meta?, type: KType) -> Name,
): ReWrapAction<R> = ReWrapAction(typeOf<R>(), newMeta, newName)

View File

@ -5,10 +5,13 @@ import space.kscience.dataforge.io.asBinary
import space.kscience.dataforge.misc.DfType
import space.kscience.snark.SnarkReader.Companion.DEFAULT_PRIORITY
import space.kscience.snark.SnarkReader.Companion.DF_TYPE
import kotlin.reflect.KType
import kotlin.reflect.typeOf
@DfType(DF_TYPE)
public interface SnarkReader<out T> : IOReader<T> {
public val types: Set<String>
public val outputType: KType
public val inputContentTypes: Set<String>
public val priority: Int get() = DEFAULT_PRIORITY
public fun readFrom(source: String): T
@ -23,13 +26,14 @@ public interface SnarkReader<out T> : IOReader<T> {
*
* @param T The type of data to be read by the IOReader.
* @property reader The underlying IOReader instance used for reading data.
* @property types The set of supported types that can be read by the SnarkIOReader.
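* @property outputType The runtime type of the values produced by the reader.
* @property inputContentTypes The set of input content types (for example MIME types or file extensions) that this reader accepts.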
* @property priority The priority of the SnarkIOReader. Higher priority SnarkIOReader instances will be preferred over lower priority ones.
*/
private class SnarkReaderWrapper<out T>(
private val reader: IOReader<T>,
override val types: Set<String>,
override val outputType: KType,
override val inputContentTypes: Set<String>,
override val priority: Int = DEFAULT_PRIORITY,
) : IOReader<T> by reader, SnarkReader<T> {
@ -38,6 +42,14 @@ private class SnarkReaderWrapper<out T>(
public fun <T : Any> SnarkReader(
reader: IOReader<T>,
vararg types: String,
outputType: KType,
vararg inputContentTypes: String,
priority: Int = DEFAULT_PRIORITY,
): SnarkReader<T> = SnarkReaderWrapper(reader, types.toSet(), priority)
): SnarkReader<T> = SnarkReaderWrapper(reader, outputType, inputContentTypes.toSet(), priority)
/**
 * Wraps [reader] into a [SnarkReader], taking the output type from the reified type parameter [T].
 *
 * @param reader the underlying reader that does the actual parsing.
 * @param inputContentTypes content types the resulting reader declares as supported.
 * @param priority priority of the resulting reader.
 */
public inline fun <reified T : Any> SnarkReader(
    reader: IOReader<T>,
    vararg inputContentTypes: String,
    priority: Int = DEFAULT_PRIORITY,
): SnarkReader<T> = SnarkReader(
    reader = reader,
    outputType = typeOf<T>(),
    inputContentTypes = inputContentTypes,
    priority = priority,
)

View File

@ -8,6 +8,7 @@ val ktorVersion = space.kscience.gradle.KScienceVersions.ktorVersion
kscience{
jvm()
useSerialization()
useContextReceivers()
commonMain{
api(projects.snarkCore)

View File

@ -71,7 +71,7 @@ public object SnarkFlavorDescriptor : GFMFlavourDescriptor(false) {
}
public object MarkdownReader : SnarkHtmlReader {
override val types: Set<String> = setOf("text/markdown", "md", "markdown")
override val inputContentTypes: Set<String> = setOf("text/markdown", "md", "markdown")
override fun readFrom(source: String): PageFragment = PageFragment {
val parsedTree = markdownParser.parse(IElementType("ROOT"), source)
@ -88,7 +88,4 @@ public object MarkdownReader : SnarkHtmlReader {
private val markdownParser = MarkdownParser(markdownFlavor)
override fun readFrom(source: Source): PageFragment = readFrom(source.readString())
public val snarkReader: SnarkReader<PageFragment> = SnarkReader(this, "text/markdown")
}

View File

@ -6,17 +6,35 @@ import space.kscience.dataforge.io.Binary
import space.kscience.dataforge.io.toByteArray
import space.kscience.dataforge.meta.Meta
import space.kscience.dataforge.meta.get
import space.kscience.dataforge.misc.DFInternal
import space.kscience.snark.SnarkReader
import space.kscience.snark.TextProcessor
import kotlin.coroutines.CoroutineContext
import kotlin.coroutines.EmptyCoroutineContext
import kotlin.reflect.KType
import kotlin.reflect.typeOf
public class ParseAction(private val snarkHtml: SnarkHtml) :
AbstractAction<Binary, PageFragment>(typeOf<PageFragment>()) {
/**
 * Creates a new [Data] with an explicitly supplied runtime [type] whose value is computed by awaiting
 * this data and passing the result to [block].
 *
 * This data is passed to the [Data] factory as the single element of its fourth argument
 * (presumably the dependency list — confirm against the DataForge `Data` factory).
 *
 * @param type runtime type reported by the resulting data.
 * @param meta meta of the resulting data; defaults to the meta of this data.
 * @param coroutineContext coroutine context forwarded to the [Data] factory.
 * @param block transformation applied to the awaited value.
 */
@OptIn(DFInternal::class)
internal fun <T, R> Data<T>.transform(
    type: KType,
    meta: Meta = this.meta,
    coroutineContext: CoroutineContext = EmptyCoroutineContext,
    block: suspend (T) -> R,
): Data<R> = Data(type, meta, coroutineContext, listOf(this)) {
    block(await())
}
private fun parseOne(data: NamedData<Binary>): NamedData<PageFragment>? = with(snarkHtml) {
public class ParseAction(private val snarkHtml: SnarkHtml) :
AbstractAction<Binary, Any>(typeOf<PageFragment>()) {
private fun parseOne(data: NamedData<Binary>): NamedData<Any>? = with(snarkHtml) {
val contentType = getContentType(data.name, data.meta)
val parser = snark.readers.values.filterIsInstance<SnarkHtmlReader>().filter { parser ->
contentType in parser.types
val parser: SnarkReader<Any>? = snark.readers.values.filter { parser ->
contentType in parser.inputContentTypes
}.maxByOrNull {
it.priority
}
@ -24,7 +42,7 @@ public class ParseAction(private val snarkHtml: SnarkHtml) :
//ignore data for which parser is not found
if (parser != null) {
val preprocessor = meta[TextProcessor.TEXT_PREPROCESSOR_KEY]?.let { snark.preprocessor(it) }
data.transform {
data.transform(parser.outputType) {
if (preprocessor == null) {
parser.readFrom(it)
} else {
@ -38,13 +56,13 @@ public class ParseAction(private val snarkHtml: SnarkHtml) :
}
}
override fun DataSink<PageFragment>.generate(data: DataTree<Binary>, meta: Meta) {
override fun DataSink<Any>.generate(data: DataTree<Binary>, meta: Meta) {
data.forEach {
parseOne(it)?.let { put(it) }
}
}
override fun DataSink<PageFragment>.update(source: DataTree<Binary>, meta: Meta, namedData: NamedData<Binary>) {
override fun DataSink<Any>.update(source: DataTree<Binary>, meta: Meta, namedData: NamedData<Binary>) {
parseOne(namedData)?.let { put(it) }
}
}

View File

@ -22,8 +22,7 @@ import space.kscience.dataforge.meta.get
import space.kscience.dataforge.meta.set
import space.kscience.dataforge.meta.string
import space.kscience.dataforge.misc.DFExperimental
import space.kscience.dataforge.names.Name
import space.kscience.dataforge.names.asName
import space.kscience.dataforge.names.*
import space.kscience.dataforge.provider.dfType
import space.kscience.dataforge.workspace.*
import space.kscience.snark.ReWrapAction
@ -33,6 +32,7 @@ import space.kscience.snark.TextProcessor
import java.net.URLConnection
import kotlin.io.path.Path
import kotlin.io.path.extension
import kotlin.reflect.typeOf
public fun <T : Any, R : Any> DataTree<T>.transform(action: Action<T, R>, meta: Meta = Meta.EMPTY): DataTree<R> =
@ -52,8 +52,8 @@ public class SnarkHtml : WorkspacePlugin() {
SnarkReader::class.dfType -> mapOf(
"html".asName() to HtmlReader,
"markdown".asName() to MarkdownReader,
"json".asName() to SnarkReader(JsonMetaFormat, ContentType.Application.Json.toString()),
"yaml".asName() to SnarkReader(YamlMetaFormat, "text/yaml", "yaml"),
"json".asName() to SnarkReader<Meta>(JsonMetaFormat, ContentType.Application.Json.toString()),
"yaml".asName() to SnarkReader<Meta>(YamlMetaFormat, "text/yaml", "yaml"),
)
else -> super.content(target)
@ -64,16 +64,33 @@ public class SnarkHtml : WorkspacePlugin() {
URLConnection.guessContentTypeFromName(filePath) ?: Path(filePath).extension
}
public val prepareHeaderAction: ReWrapAction<Any> = ReWrapAction.removeExtensions<Any>("html", "md") { name ->
val contentType = getContentType(name, this)
set(CONTENT_TYPE_KEY, contentType)
/**
 * Re-wrap action that enriches data meta and normalizes data names.
 *
 * Meta: stores the unescaped last name token under [FILE_NAME_KEY] and the resolved content type
 * under [CONTENT_TYPE_KEY].
 *
 * Name: for every data item whose type is not [Binary], the extension (the text after the last `.`)
 * is stripped from the last name token; [Binary] items keep their names unchanged.
 */
internal val prepareHeaderAction: ReWrapAction<Any> = ReWrapAction(
    type = typeOf<Any>(),
    newMeta = { name ->
        // The content type is resolved first, while the meta is still unmodified.
        val resolvedContentType = getContentType(name, this)
        set(FILE_NAME_KEY, name.last().toStringUnescaped())
        set(CONTENT_TYPE_KEY, resolvedContentType)
    }
) { name, _, dataType ->
    name.replaceLast { token ->
        if (dataType == typeOf<Binary>()) {
            token
        } else {
            val extension = token.body.substringAfterLast('.')
            NameToken(token.body.removeSuffix(".$extension"))
        }
    }
}
public fun parse(name: Name, markup: String, meta: Meta): PageFragment {
/**
 * Re-wrap action that drops a trailing `index` token from data names and leaves all other names as they are.
 */
public val removeIndexAction: ReWrapAction<Any> = ReWrapAction(typeOf<Any>()) { name, _, _ ->
    when {
        name.endsWith("index") -> name.cutLast()
        else -> name
    }
}
public fun parseMarkup(name: Name, markup: String, meta: Meta): PageFragment {
val contentType = getContentType(name, meta)
val parser = snark.readers.values.filterIsInstance<SnarkHtmlReader>().filter { parser ->
contentType in parser.types
contentType in parser.inputContentTypes
}.maxByOrNull {
it.priority
} ?: error("Parser for name $name and meta $meta not found")
@ -88,10 +105,8 @@ public class SnarkHtml : WorkspacePlugin() {
}
}
public val parseAction: Action<Binary, Any> = ParseAction(this)
public val removeIndexAction: ReWrapAction<Any> = ReWrapAction.removeIndex<Any>()
public val parseAction: Action<Binary, PageFragment> = ParseAction(this)
private val allDataNotNull: DataSelector<Any>
get() = DataSelector { workspace, _ -> workspace.data.filterByType() }
@ -108,6 +123,8 @@ public class SnarkHtml : WorkspacePlugin() {
public companion object : PluginFactory<SnarkHtml> {
override val tag: PluginTag = PluginTag("snark.html")
/**
 * Meta key under which the original file name (the last name token) of a data item is stored.
 *
 * It must differ from the content type key: both used to be `contentType`, so the file name
 * was overwritten by the content type as soon as both were written to the same meta.
 */
public val FILE_NAME_KEY: Name = "fileName".asName()
public val CONTENT_TYPE_KEY: Name = "contentType".asName()
override fun build(context: Context, meta: Meta): SnarkHtml = SnarkHtml()

View File

@ -5,12 +5,16 @@ import kotlinx.html.unsafe
import kotlinx.io.Source
import kotlinx.io.readString
import space.kscience.snark.SnarkReader
import kotlin.reflect.KType
import kotlin.reflect.typeOf
public interface SnarkHtmlReader : SnarkReader<PageFragment>
public interface SnarkHtmlReader : SnarkReader<PageFragment>{
override val outputType: KType get() = typeOf<PageFragment>()
}
public object HtmlReader : SnarkHtmlReader {
override val types: Set<String> = setOf("html")
override val inputContentTypes: Set<String> = setOf("html")
override fun readFrom(source: String): PageFragment = PageFragment {
div {

View File

@ -1,9 +1,10 @@
package space.kscience.snark.html.document
import kotlinx.coroutines.launch
import kotlinx.coroutines.runBlocking
import kotlinx.html.body
import kotlinx.html.head
import kotlinx.html.title
import kotlinx.html.*
import space.kscience.dataforge.context.info
import space.kscience.dataforge.context.logger
import space.kscience.dataforge.context.request
import space.kscience.dataforge.data.*
import space.kscience.dataforge.meta.Laminate
@ -11,6 +12,9 @@ import space.kscience.dataforge.meta.Meta
import space.kscience.dataforge.meta.get
import space.kscience.dataforge.meta.string
import space.kscience.dataforge.names.Name
import space.kscience.dataforge.names.cutLast
import space.kscience.dataforge.names.endsWith
import space.kscience.dataforge.names.parseAsName
import space.kscience.snark.SnarkBuilder
import space.kscience.snark.SnarkContext
import space.kscience.snark.html.*
@ -26,7 +30,7 @@ public interface DocumentBuilder : SnarkContext {
public val documentMeta: Meta
public val documentData: DataTree<*>
public val data: DataTree<*>
public suspend fun fragment(fragment: Data<*>, overrideMeta: Meta? = null)
@ -43,10 +47,13 @@ public suspend fun DocumentBuilder.fragment(fragmentName: String) {
fragment(site.siteData[fragmentName] ?: error("Can't find data fragment for $fragmentName in site data."))
}
private class PageBasedDocumentBuilder(val page: PageContextWithData) : DocumentBuilder {
private class PageBasedDocumentBuilder(
val page: PageContextWithData,
private val dataRootName: Name,
) : DocumentBuilder {
override val documentName: Name get() = page.pageRoute
override val documentMeta: Meta get() = page.pageMeta
override val documentData: DataTree<*> get() = page.data
override val data: DataTree<*> = page.data.branch(dataRootName) ?: DataTree.EMPTY
val fragments = mutableListOf<PageFragment>()
@ -56,24 +63,36 @@ private class PageBasedDocumentBuilder(val page: PageContextWithData) : Document
override suspend fun fragment(fragment: DocumentFragment, overrideMeta: Meta?) {
when (fragment) {
is ImageDocumentFragment -> fragment {
figure("snark-figure") {
img(classes = "snark-image") {
src = fragment.path
alt = fragment.meta["alt"].string ?: ""
}
fragment.meta["caption"].string?.let { caption ->
figcaption("snark-figure-caption") { +caption }
}
}
}
is MarkupDocumentFragment -> {
    // Parse the inline markup and add the resulting page fragment to the document.
    // The parsed result used to be discarded, so markup fragments never showed up in the output.
    val snarkHtml = page.context.request(SnarkHtml)
    fragment(snarkHtml.parseMarkup(Name.EMPTY, fragment.text, fragment.meta))
}
is DataDocumentFragment -> {
val data = data[fragment.name]
?: error("Can't find data with name ${fragment.name} for $fragment")
fragment(data)
}
is ListDocumentFragment -> {
val meta = Laminate(overrideMeta, fragment.meta)
fragment.fragments.forEach { fragment(it, meta) }
}
is ImageDocumentFragment -> TODO()
is MarkupDocumentFragment -> {
val snarkHtml = page.context.request(SnarkHtml)
TODO()
}
is DataDocumentFragment -> {
val data = documentData[fragment.dataName]
?: error("Can't find data with name ${fragment.dataName} for $fragment")
fragment(data)
}
is LayoutDocumentFragment -> TODO()
is LayoutDocumentFragment -> TODO("Layouts are not implemented")
}
}
@ -82,7 +101,7 @@ private class PageBasedDocumentBuilder(val page: PageContextWithData) : Document
typeOf<PageFragment>() -> fragment(fragment.await() as PageFragment)
typeOf<DocumentFragment>() -> fragment(
fragment.await() as DocumentFragment,
Laminate(overrideMeta, documentData.meta)
Laminate(overrideMeta, data.meta)
)
typeOf<String>() -> fragment(
@ -98,11 +117,13 @@ private class PageBasedDocumentBuilder(val page: PageContextWithData) : Document
public fun SiteContextWithData.document(
documentName: Name,
documentMeta: Meta = Meta.EMPTY,
headers: MetaDataContent.() -> Unit = {},
block: suspend DocumentBuilder.() -> Unit,
): Unit = page(documentName, documentMeta) {
val documentBuilder = runBlocking { PageBasedDocumentBuilder(page).apply { block() } }
val documentBuilder = runBlocking { PageBasedDocumentBuilder(page, documentName).apply { block() } }
head {
title(documentMeta["title"].string ?: "Snark document")
headers()
}
body {
postprocess(FtlDocumentProcessor(this@document.context, documentBuilder)) {
@ -112,3 +133,57 @@ public fun SiteContextWithData.document(
}
}
}
/**
 * Registers a page at [route] that renders the document described by [descriptor].
 *
 * The page head contains the descriptor title (or `Snark document` when it is absent) followed by
 * [headers]. The body starts with a title heading (falling back to [dataName]) and one block per
 * author, followed by all fragments listed in the descriptor, post-processed by [FtlDocumentProcessor].
 *
 * @param route route of the resulting page.
 * @param dataName name of the data branch used as the root for resolving document data.
 * @param descriptor document description: title, authors, meta and fragments.
 * @param headers additional content appended to the page head.
 */
public fun SiteContextWithData.document(
    route: Name,
    dataName: Name,
    descriptor: DocumentDescriptor,
    headers: MetaDataContent.() -> Unit = {},
): Unit = page(route, descriptor.documentMeta ?: Meta.EMPTY) {
    // The fragment API is suspending, so all fragments are collected in a blocking call before rendering.
    val builder = PageBasedDocumentBuilder(page, dataName)
    runBlocking {
        for (documentFragment in descriptor.fragments) {
            builder.fragment(documentFragment)
        }
    }

    head {
        title(descriptor.title ?: "Snark document")
        headers()
    }

    body {
        h1("title") { +(descriptor.title ?: dataName.toString()) }

        for (author in descriptor.authors) {
            div("author") {
                div("author-name") { +author.name }
                val affiliation = author.affiliation
                if (affiliation != null) {
                    div("author-affiliation") { +affiliation }
                }
            }
        }

        postprocess(FtlDocumentProcessor(this@document.context, builder)) {
            builder.fragments.forEach { pageFragment ->
                fragment(pageFragment)
            }
        }
    }
}
/**
 * Scans the site data for document descriptors and registers a document page for each of them.
 *
 * A data item is treated as a descriptor when its type is [Meta] and its name ends with `document`.
 * The parent name of the descriptor is used as the document data root and, unless the descriptor
 * declares its own route, as the page route.
 *
 * NOTE(review): each document is registered inside `context.launch`, and this function returns
 * without joining those jobs, so registration may still be in progress after it returns — confirm
 * that callers tolerate this.
 *
 * @param headers additional content appended to the head of every document page.
 */
public fun SiteContextWithData.allDocuments(
    headers: MetaDataContent.() -> Unit = {},
) {
    siteData.forEach { documentData ->
        val isDescriptor = documentData.type == typeOf<Meta>() && documentData.name.endsWith("document")
        if (!isDescriptor) return@forEach

        context.launch {
            val descriptor = DocumentDescriptor.read(documentData.data.await() as Meta)
            val documentDirectory = documentData.name.cutLast()
            val documentRoute = descriptor.route?.parseAsName(false) ?: documentDirectory
            context.logger.info { "Loading document $documentRoute" }
            document(
                route = documentRoute,
                dataName = documentDirectory,
                descriptor = descriptor,
                headers = headers
            )
        }
    }
}

View File

@ -0,0 +1,28 @@
package space.kscience.snark.html.document
import space.kscience.dataforge.meta.*
import space.kscience.dataforge.misc.DFExperimental
/**
 * Meta-backed description of a document author.
 */
public class Author : Scheme() {
// Author name. NOTE(review): the lambda presumably acts as the default provider and fails with
// "Name is required" when no value is present — confirm against the Scheme `string` delegate.
public var name: String by string { error("Name is required") }
// Optional affiliation of the author.
public var affiliation: String? by string()
public companion object : SchemeSpec<Author>(::Author)
}
/**
 * Meta-backed descriptor of a document: its route, title, extra meta, authors and fragments.
 */
public class DocumentDescriptor : Scheme() {
// Optional route of the document page.
public var route: String? by string()
// Optional document title.
public var title: String? by string()
// Optional meta node attached to the document.
public var documentMeta: Meta? by node()
// Authors of the document, each read as an [Author] scheme.
public var authors: List<Author> by listOfScheme(Author)
// Fragments that make up the document, read from meta as serializable [DocumentFragment] values.
@OptIn(DFExperimental::class)
public var fragments: List<DocumentFragment> by meta.listOfSerializable<DocumentFragment>()
public companion object : SchemeSpec<DocumentDescriptor>(::DocumentDescriptor)
}

View File

@ -1,19 +1,46 @@
package space.kscience.snark.html.document
import kotlinx.io.files.Path
import kotlinx.serialization.SerialName
import kotlinx.serialization.Serializable
import space.kscience.dataforge.meta.Meta
import space.kscience.dataforge.names.Name
public sealed interface DocumentFragment{
@Serializable
public sealed interface DocumentFragment {
public val meta: Meta
}
public class MarkupDocumentFragment(public val text: String, override val meta: Meta) : DocumentFragment
/**
 * A document fragment that carries markup source inline as [text].
 *
 * Serialized under the name `markup`.
 */
@Serializable
@SerialName("markup")
public class MarkupDocumentFragment(
public val text: String,
override val meta: Meta = Meta.EMPTY,
) : DocumentFragment
public class ImageDocumentFragment(public val image: Path, override val meta: Meta) : DocumentFragment
/**
 * A document fragment that refers to an image by its [path].
 *
 * Serialized under the name `image`.
 */
@Serializable
@SerialName("image")
public class ImageDocumentFragment(
public val path: String,
override val meta: Meta = Meta.EMPTY,
) : DocumentFragment
public class DataDocumentFragment(public val dataName: Name, override val meta: Meta) : DocumentFragment
/**
 * A document fragment that refers to a data item by its [name].
 *
 * Serialized under the name `data`.
 */
@Serializable
@SerialName("data")
public class DataDocumentFragment(
public val name: Name,
override val meta: Meta = Meta.EMPTY,
) : DocumentFragment
public class ListDocumentFragment(public val fragments: List<DocumentFragment>, override val meta: Meta) : DocumentFragment
/**
 * A document fragment that groups a list of nested [fragments].
 *
 * Serialized under the name `list`.
 */
@Serializable
@SerialName("list")
public class ListDocumentFragment(
public val fragments: List<DocumentFragment>,
override val meta: Meta = Meta.EMPTY,
) : DocumentFragment
public class LayoutDocumentFragment(public val fragments: Map<String,DocumentFragment>, override val meta: Meta) : DocumentFragment
/**
 * A document fragment that holds nested [fragments] keyed by a string.
 *
 * Serialized under the name `layout`.
 */
@Serializable
@SerialName("layout")
public class LayoutDocumentFragment(
public val fragments: Map<String, DocumentFragment>,
override val meta: Meta = Meta.EMPTY,
) : DocumentFragment