Fixes in Envelope format and data tree

This commit is contained in:
Alexander Nozik 2023-03-27 09:45:51 +03:00
parent 29fa30fb51
commit 2c2f33427a
11 changed files with 98 additions and 56 deletions

View File

@ -11,6 +11,7 @@
- More fine-grained types in Action builders. - More fine-grained types in Action builders.
### Changed ### Changed
- `Name::replaceLast` API
- `PluginFactory` no longer requires plugin class - `PluginFactory` no longer requires plugin class
- Collection<Named> toMap -> associateByName - Collection<Named> toMap -> associateByName
- Simplified `DFTL` envelope format. Closing symbols are unnecessary. Properties are discontinued. - Simplified `DFTL` envelope format. Closing symbols are unnecessary. Properties are discontinued.
@ -36,6 +37,8 @@
### Removed ### Removed
### Fixed ### Fixed
- `readDataDirectory` does not split names with dots
- Front matter reader does not crash on non-UTF files
- Meta file name in readMeta from directory - Meta file name in readMeta from directory
- Tagless and FrontMatter envelope partial readers fix. - Tagless and FrontMatter envelope partial readers fix.

View File

@ -9,7 +9,7 @@ plugins {
allprojects { allprojects {
group = "space.kscience" group = "space.kscience"
version = "0.6.1-dev-5" version = "0.6.1-dev-6"
} }
subprojects { subprojects {

View File

@ -107,7 +107,7 @@ public fun <T : Any> DataSet<T>.startAll(coroutineScope: CoroutineScope): Job =
}.joinAll() }.joinAll()
} }
public suspend fun <T : Any> DataSet<T>.join(): Unit = coroutineScope { startAll(this).join() } public suspend fun <T : Any> DataSet<T>.computeAndJoinAll(): Unit = coroutineScope { startAll(this).join() }
public fun DataSet<*>.toMeta(): Meta = Meta { public fun DataSet<*>.toMeta(): Meta = Meta {
forEach { forEach {

View File

@ -1,6 +1,7 @@
package space.kscience.dataforge.data package space.kscience.dataforge.data
import space.kscience.dataforge.meta.Meta import space.kscience.dataforge.meta.Meta
import space.kscience.dataforge.misc.DFInternal
import space.kscience.dataforge.misc.Type import space.kscience.dataforge.misc.Type
import space.kscience.dataforge.names.* import space.kscience.dataforge.names.*
import kotlin.collections.component1 import kotlin.collections.component1
@ -65,11 +66,16 @@ public interface DataTree<out T : Any> : DataSet<T> {
*/ */
public val META_ITEM_NAME_TOKEN: NameToken = NameToken("@meta") public val META_ITEM_NAME_TOKEN: NameToken = NameToken("@meta")
public inline fun <reified T : Any> empty(meta: Meta = Meta.EMPTY): DataTree<T> = object : DataTree<T> { @DFInternal
public fun <T : Any> emptyWithType(type: KType, meta: Meta = Meta.EMPTY): DataTree<T> = object : DataTree<T> {
override val items: Map<NameToken, DataTreeItem<T>> get() = emptyMap() override val items: Map<NameToken, DataTreeItem<T>> get() = emptyMap()
override val dataType: KType get() = typeOf<T>() override val dataType: KType get() = type
override val meta: Meta get() = meta override val meta: Meta get() = meta
} }
@OptIn(DFInternal::class)
public inline fun <reified T : Any> empty(meta: Meta = Meta.EMPTY): DataTree<T> =
emptyWithType<T>(typeOf<T>(), meta)
} }
} }
@ -106,12 +112,8 @@ public fun <T : Any> DataTree<T>.traverseItems(): Sequence<Pair<Name, DataTreeIt
* Get a branch of this [DataTree] with a given [branchName]. * Get a branch of this [DataTree] with a given [branchName].
* The difference from similar method for [DataSet] is that internal logic is more simple and the return value is a [DataTree] * The difference from similar method for [DataSet] is that internal logic is more simple and the return value is a [DataTree]
*/ */
public fun <T : Any> DataTree<T>.branch(branchName: Name): DataTree<T> = object : DataTree<T> { @OptIn(DFInternal::class)
override val dataType: KType get() = this@branch.dataType public fun <T : Any> DataTree<T>.branch(branchName: Name): DataTree<T> =
getItem(branchName)?.tree ?: DataTree.emptyWithType(dataType)
override val meta: Meta public fun <T : Any> DataTree<T>.branch(branchName: String): DataTree<T> = branch(branchName.parseAsName())
get() = getItem(branchName)?.meta ?: Meta.EMPTY
override val items: Map<NameToken, DataTreeItem<T>>
get() = getItem(branchName).tree?.items ?: emptyMap()
}

View File

@ -68,8 +68,9 @@ public class FrontMatterEnvelopeFormat(
} }
override fun peekFormat(io: IOPlugin, binary: Binary): EnvelopeFormat? = binary.read { override fun peekFormat(io: IOPlugin, binary: Binary): EnvelopeFormat? = binary.read {
val line = readSafeUtf8Line() //read raw string to avoid UTF issues
return@read if (line.startsWith("---")) { val line = readRawString(3)
return@read if (line == "---") {
default default
} else { } else {
null null

View File

@ -144,6 +144,15 @@ public fun Name.firstOrNull(): NameToken? = tokens.firstOrNull()
*/ */
public fun Name.first(): NameToken = tokens.first() public fun Name.first(): NameToken = tokens.first()
/**
 * Return a [Name] with its last token replaced via the [replacement] rule.
 *
 * If this [Name] has no last token (i.e. it is empty), [Name.EMPTY] is returned and [replacement] is not invoked.
 *
 * @param replacement a function that receives the current last token and produces the token to put in its place.
 * @return all tokens except the last one, followed by the result of [replacement].
 */
public fun Name.replaceLast(replacement: (NameToken) -> NameToken): Name {
    // A single null-check replaces the former `isEmpty()` guard plus `!!` assertion
    val lastToken = lastOrNull() ?: return Name.EMPTY
    return cutLast() + replacement(lastToken)
}
/** /**
* Convert the [String] to a [Name] by simply wrapping it in a single name token without parsing. * Convert the [String] to a [Name] by simply wrapping it in a single name token without parsing.
@ -227,7 +236,6 @@ public fun Name.removeFirstOrNull(first: Name): Name? = if (startsWith(first)) {
null null
} }
@ThreadLocal @ThreadLocal
private val nameCache = HashMap<String, Name>() private val nameCache = HashMap<String, Name>()

View File

@ -17,6 +17,7 @@ kscience{
} }
dependencies(jvmTest){ dependencies(jvmTest){
implementation(spclibs.logback.classic) implementation(spclibs.logback.classic)
implementation(projects.dataforgeIo.dataforgeIoYaml)
} }
} }

View File

@ -19,6 +19,7 @@ import space.kscience.dataforge.names.Name
import space.kscience.dataforge.names.NameToken import space.kscience.dataforge.names.NameToken
import space.kscience.dataforge.names.asName import space.kscience.dataforge.names.asName
import space.kscience.dataforge.names.plus import space.kscience.dataforge.names.plus
import space.kscience.dataforge.workspace.FileData.Companion.DEFAULT_IGNORE_EXTENSIONS
import java.nio.file.Files import java.nio.file.Files
import java.nio.file.Path import java.nio.file.Path
import java.nio.file.StandardWatchEventKinds import java.nio.file.StandardWatchEventKinds
@ -27,6 +28,7 @@ import java.nio.file.attribute.BasicFileAttributes
import java.nio.file.spi.FileSystemProvider import java.nio.file.spi.FileSystemProvider
import java.time.Instant import java.time.Instant
import kotlin.io.path.extension import kotlin.io.path.extension
import kotlin.io.path.name
import kotlin.io.path.nameWithoutExtension import kotlin.io.path.nameWithoutExtension
import kotlin.io.path.readAttributes import kotlin.io.path.readAttributes
import kotlin.reflect.KType import kotlin.reflect.KType
@ -54,6 +56,8 @@ public class FileData<T> internal constructor(private val data: Data<T>, public
public val FILE_EXTENSION_KEY: Name = FILE_KEY + "extension" public val FILE_EXTENSION_KEY: Name = FILE_KEY + "extension"
public val FILE_CREATE_TIME_KEY: Name = FILE_KEY + "created" public val FILE_CREATE_TIME_KEY: Name = FILE_KEY + "created"
public val FILE_UPDATE_TIME_KEY: Name = FILE_KEY + "updated" public val FILE_UPDATE_TIME_KEY: Name = FILE_KEY + "updated"
public const val DF_FILE_EXTENSION: String = "df"
public val DEFAULT_IGNORE_EXTENSIONS: Set<String> = setOf(DF_FILE_EXTENSION)
} }
} }
@ -88,13 +92,17 @@ public fun <T : Any> IOPlugin.readDataFile(
context(IOPlugin) @DFExperimental context(IOPlugin) @DFExperimental
private fun <T : Any> DataSetBuilder<T>.directory(path: Path, formatResolver: FileFormatResolver<T>) { private fun <T : Any> DataSetBuilder<T>.directory(
path: Path,
ignoreExtensions: Set<String>,
formatResolver: FileFormatResolver<T>,
) {
Files.list(path).forEach { childPath -> Files.list(path).forEach { childPath ->
val fileName = childPath.fileName.toString() val fileName = childPath.fileName.toString()
if (fileName.startsWith(IOPlugin.META_FILE_NAME)) { if (fileName.startsWith(IOPlugin.META_FILE_NAME)) {
meta(readMetaFile(childPath)) meta(readMetaFile(childPath))
} else if (!fileName.startsWith("@")) { } else if (!fileName.startsWith("@")) {
file(childPath, formatResolver) file(childPath, ignoreExtensions, formatResolver)
} }
} }
} }
@ -107,6 +115,7 @@ private fun <T : Any> DataSetBuilder<T>.directory(path: Path, formatResolver: Fi
public fun <T : Any> IOPlugin.readDataDirectory( public fun <T : Any> IOPlugin.readDataDirectory(
type: KType, type: KType,
path: Path, path: Path,
ignoreExtensions: Set<String> = DEFAULT_IGNORE_EXTENSIONS,
formatResolver: FileFormatResolver<T>, formatResolver: FileFormatResolver<T>,
): DataTree<T> { ): DataTree<T> {
//read zipped data node //read zipped data node
@ -116,14 +125,14 @@ public fun <T : Any> IOPlugin.readDataDirectory(
?: error("Zip file system provider not found") ?: error("Zip file system provider not found")
val fs = fsProvider.newFileSystem(path, mapOf("create" to "true")) val fs = fsProvider.newFileSystem(path, mapOf("create" to "true"))
return readDataDirectory(type, fs.rootDirectories.first(), formatResolver) return readDataDirectory(type, fs.rootDirectories.first(), ignoreExtensions, formatResolver)
} }
if (!Files.isDirectory(path)) error("Provided path $path is not a directory") if (!Files.isDirectory(path)) error("Provided path $path is not a directory")
return DataTree(type) { return DataTree(type) {
meta { meta {
FileData.FILE_PATH_KEY put path.toString() FileData.FILE_PATH_KEY put path.toString()
} }
directory(path, formatResolver) directory(path, ignoreExtensions, formatResolver)
} }
} }
@ -131,8 +140,9 @@ public fun <T : Any> IOPlugin.readDataDirectory(
@DFExperimental @DFExperimental
public inline fun <reified T : Any> IOPlugin.readDataDirectory( public inline fun <reified T : Any> IOPlugin.readDataDirectory(
path: Path, path: Path,
ignoreExtensions: Set<String> = DEFAULT_IGNORE_EXTENSIONS,
noinline formatResolver: FileFormatResolver<T>, noinline formatResolver: FileFormatResolver<T>,
): DataTree<T> = readDataDirectory(typeOf<T>(), path, formatResolver) ): DataTree<T> = readDataDirectory(typeOf<T>(), path, ignoreExtensions, formatResolver)
/** /**
* Read raw binary data tree from the directory. All files are read as-is (save for meta files). * Read raw binary data tree from the directory. All files are read as-is (save for meta files).
@ -140,7 +150,8 @@ public inline fun <reified T : Any> IOPlugin.readDataDirectory(
@DFExperimental @DFExperimental
public fun IOPlugin.readRawDirectory( public fun IOPlugin.readRawDirectory(
path: Path, path: Path,
): DataTree<Binary> = readDataDirectory(path) { _, _ -> IOReader.binary } ignoreExtensions: Set<String> = emptySet(),
): DataTree<Binary> = readDataDirectory(path, ignoreExtensions) { _, _ -> IOReader.binary }
private fun Path.toName() = Name(map { NameToken.parse(it.nameWithoutExtension) }) private fun Path.toName() = Name(map { NameToken.parse(it.nameWithoutExtension) })
@ -150,12 +161,13 @@ private fun Path.toName() = Name(map { NameToken.parse(it.nameWithoutExtension)
public fun <T : Any> IOPlugin.monitorDataDirectory( public fun <T : Any> IOPlugin.monitorDataDirectory(
type: KType, type: KType,
path: Path, path: Path,
ignoreExtensions: Set<String> = DEFAULT_IGNORE_EXTENSIONS,
formatResolver: FileFormatResolver<T>, formatResolver: FileFormatResolver<T>,
): DataSource<T> { ): DataSource<T> {
if (path.fileName.toString().endsWith(".zip")) error("Monitoring not supported for ZipFS") if (path.fileName.toString().endsWith(".zip")) error("Monitoring not supported for ZipFS")
if (!Files.isDirectory(path)) error("Provided path $path is not a directory") if (!Files.isDirectory(path)) error("Provided path $path is not a directory")
return DataSource(type, context) { return DataSource(type, context) {
directory(path, formatResolver) directory(path, ignoreExtensions, formatResolver)
launch(Dispatchers.IO) { launch(Dispatchers.IO) {
val watchService = path.fileSystem.newWatchService() val watchService = path.fileSystem.newWatchService()
@ -178,7 +190,7 @@ public fun <T : Any> IOPlugin.monitorDataDirectory(
if (fileName.startsWith(IOPlugin.META_FILE_NAME)) { if (fileName.startsWith(IOPlugin.META_FILE_NAME)) {
meta(readMetaFile(eventPath)) meta(readMetaFile(eventPath))
} else if (!fileName.startsWith("@")) { } else if (!fileName.startsWith("@")) {
file(eventPath, formatResolver) file(eventPath, ignoreExtensions, formatResolver)
} }
} }
} }
@ -197,8 +209,9 @@ public fun <T : Any> IOPlugin.monitorDataDirectory(
@DFExperimental @DFExperimental
public inline fun <reified T : Any> IOPlugin.monitorDataDirectory( public inline fun <reified T : Any> IOPlugin.monitorDataDirectory(
path: Path, path: Path,
ignoreExtensions: Set<String> = DEFAULT_IGNORE_EXTENSIONS,
noinline formatResolver: FileFormatResolver<T>, noinline formatResolver: FileFormatResolver<T>,
): DataSource<T> = monitorDataDirectory(typeOf<T>(), path, formatResolver) ): DataSource<T> = monitorDataDirectory(typeOf<T>(), path, ignoreExtensions, formatResolver)
/** /**
* Read and monitor raw binary data tree from the directory. All files are read as-is (save for meta files). * Read and monitor raw binary data tree from the directory. All files are read as-is (save for meta files).
@ -206,7 +219,8 @@ public inline fun <reified T : Any> IOPlugin.monitorDataDirectory(
@DFExperimental @DFExperimental
public fun IOPlugin.monitorRawDirectory( public fun IOPlugin.monitorRawDirectory(
path: Path, path: Path,
): DataSource<Binary> = monitorDataDirectory(path) { _, _ -> IOReader.binary } ignoreExtensions: Set<String> = DEFAULT_IGNORE_EXTENSIONS,
): DataSource<Binary> = monitorDataDirectory(path, ignoreExtensions) { _, _ -> IOReader.binary }
/** /**
* Write data tree to existing directory or create a new one using default [java.nio.file.FileSystem] provider * Write data tree to existing directory or create a new one using default [java.nio.file.FileSystem] provider
@ -248,14 +262,20 @@ public suspend fun <T : Any> IOPlugin.writeDataDirectory(
/** /**
* Add file/directory-based data tree item * Add file/directory-based data tree item
*
* @param ignoreExtensions a list of file extensions for which extension should be cut from the resulting item name
*/ */
context(IOPlugin) context(IOPlugin)
@OptIn(DFInternal::class) @OptIn(DFInternal::class)
@DFExperimental @DFExperimental
public fun <T : Any> DataSetBuilder<T>.file( public fun <T : Any> DataSetBuilder<T>.file(
path: Path, path: Path,
ignoreExtensions: Set<String> = DEFAULT_IGNORE_EXTENSIONS,
formatResolver: FileFormatResolver<out T>, formatResolver: FileFormatResolver<out T>,
) { ) {
fun defaultPath() = if (path.extension in ignoreExtensions) path.nameWithoutExtension else path.name
try { try {
//If path is a single file or a special directory, read it as single datum //If path is a single file or a special directory, read it as single datum
if (!Files.isDirectory(path) || Files.list(path).allMatch { it.fileName.toString().startsWith("@") }) { if (!Files.isDirectory(path) || Files.list(path).allMatch { it.fileName.toString().startsWith("@") }) {
@ -264,13 +284,13 @@ public fun <T : Any> DataSetBuilder<T>.file(
logger.warn { "File format is not resolved for $path. Skipping." } logger.warn { "File format is not resolved for $path. Skipping." }
return return
} }
val name = data.meta[Envelope.ENVELOPE_NAME_KEY].string ?: path.nameWithoutExtension val name: String = data.meta[Envelope.ENVELOPE_NAME_KEY].string ?: defaultPath()
data(name, data) data(name.asName(), data)
} else { } else {
//otherwise, read as directory //otherwise, read as directory
val data: DataTree<T> = readDataDirectory(dataType, path, formatResolver) val data: DataTree<T> = readDataDirectory(dataType, path, ignoreExtensions, formatResolver)
val name = data.meta[Envelope.ENVELOPE_NAME_KEY].string ?: path.nameWithoutExtension val name = data.meta[Envelope.ENVELOPE_NAME_KEY].string ?: defaultPath()
node(name, data) node(name.asName(), data)
} }
} catch (ex: Exception) { } catch (ex: Exception) {
logger.error { "Failed to read file or directory at $path: ${ex.message}" } logger.error { "Failed to read file or directory at $path: ${ex.message}" }

View File

@ -3,15 +3,16 @@ package space.kscience.dataforge.workspace
import io.ktor.utils.io.core.Input import io.ktor.utils.io.core.Input
import io.ktor.utils.io.core.Output import io.ktor.utils.io.core.Output
import kotlinx.coroutines.runBlocking import kotlinx.coroutines.runBlocking
import space.kscience.dataforge.context.Context
import space.kscience.dataforge.context.Global import space.kscience.dataforge.context.Global
import space.kscience.dataforge.data.* import space.kscience.dataforge.data.*
import space.kscience.dataforge.io.IOFormat import space.kscience.dataforge.io.*
import space.kscience.dataforge.io.io import space.kscience.dataforge.io.yaml.YamlPlugin
import space.kscience.dataforge.io.readUtf8String
import space.kscience.dataforge.io.writeUtf8String
import space.kscience.dataforge.meta.get import space.kscience.dataforge.meta.get
import space.kscience.dataforge.misc.DFExperimental import space.kscience.dataforge.misc.DFExperimental
import java.nio.file.Files import java.nio.file.Files
import kotlin.io.path.fileSize
import kotlin.io.path.toPath
import kotlin.reflect.KType import kotlin.reflect.KType
import kotlin.reflect.typeOf import kotlin.reflect.typeOf
import kotlin.test.Test import kotlin.test.Test
@ -44,32 +45,38 @@ class FileDataTest {
@Test @Test
@DFExperimental @DFExperimental
fun testDataWriteRead() { fun testDataWriteRead() = with(Global.io) {
Global.io.run { val dir = Files.createTempDirectory("df_data_node")
val dir = Files.createTempDirectory("df_data_node") runBlocking {
runBlocking { writeDataDirectory(dir, dataNode, StringIOFormat)
writeDataDirectory(dir, dataNode, StringIOFormat) println(dir.toUri().toString())
println(dir.toUri().toString()) val reconstructed = readDataDirectory(dir) { _, _ -> StringIOFormat }
val reconstructed = readDataDirectory(dir) { _, _ -> StringIOFormat } assertEquals(dataNode["dir.a"]?.meta?.get("content"), reconstructed["dir.a"]?.meta?.get("content"))
assertEquals(dataNode["dir.a"]?.meta?.get("content"), reconstructed["dir.a"]?.meta?.get("content")) assertEquals(dataNode["b"]?.await(), reconstructed["b"]?.await())
assertEquals(dataNode["b"]?.await(), reconstructed["b"]?.await())
}
} }
} }
@Test @Test
@DFExperimental @DFExperimental
fun testZipWriteRead() { fun testZipWriteRead() = with(Global.io) {
Global.io.run { val zip = Files.createTempFile("df_data_node", ".zip")
val zip = Files.createTempFile("df_data_node", ".zip") runBlocking {
runBlocking { dataNode.writeZip(zip, StringIOFormat)
dataNode.writeZip(zip, StringIOFormat) println(zip.toUri().toString())
println(zip.toUri().toString()) val reconstructed = readDataDirectory(zip) { _, _ -> StringIOFormat }
val reconstructed = readDataDirectory(zip) { _, _ -> StringIOFormat } assertEquals(dataNode["dir.a"]?.meta?.get("content"), reconstructed["dir.a"]?.meta?.get("content"))
assertEquals(dataNode["dir.a"]?.meta?.get("content"), reconstructed["dir.a"]?.meta?.get("content")) assertEquals(dataNode["b"]?.await(), reconstructed["b"]?.await())
assertEquals(dataNode["b"]?.await(), reconstructed["b"]?.await())
}
} }
} }
@Test
fun testNonEnvelope() {
    // NOTE(review): YamlPlugin is presumably registered so that its envelope formats take part in format detection — confirm
    val context = Context { plugin(YamlPlugin) }
    // Resolve the binary (non-envelope) test resource from the classpath once and reuse the path
    val imagePath = javaClass.classLoader.getResource("SPC.png")!!.toURI().toPath()
    val envelope: Envelope = context.io.readEnvelopeFile(imagePath, true)
    // The size of the envelope data is expected to match the size of the file on disk
    assertEquals(imagePath.fileSize(), envelope.data?.size?.toLong())
}
} }

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB