diff --git a/README.md b/README.md
index 84f945a2..9c1b3a37 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Things we are currently working on:
- [x] ~~App: Configure embedding providers (PR [#224](https://github.com/MindWorkAI/AI-Studio/pull/224))~~
- [x] ~~App: Implement an [ERI](https://github.com/MindWorkAI/ERI) server coding assistant (PR [#231](https://github.com/MindWorkAI/AI-Studio/pull/231))~~
- [x] ~~App: Management of data sources (local & external data via [ERI](https://github.com/MindWorkAI/ERI)) (PR [#259](https://github.com/MindWorkAI/AI-Studio/pull/259), [#273](https://github.com/MindWorkAI/AI-Studio/pull/273))~~
- - [ ] Runtime: Extract data from txt / md / pdf / docx / xlsx files
+ - [x] ~~Runtime: Extract data from txt / md / pdf / docx / xlsx files (PR [#374](https://github.com/MindWorkAI/AI-Studio/pull/374))~~
- [ ] (*Optional*) Runtime: Implement internal embedding provider through [fastembed-rs](https://github.com/Anush008/fastembed-rs)
- [ ] App: Implement external embedding providers
- [ ] App: Implement the process to vectorize one local file using embeddings
diff --git a/app/MindWork AI Studio/Pages/About.razor b/app/MindWork AI Studio/Pages/About.razor
index 282e5e72..cfc2e559 100644
--- a/app/MindWork AI Studio/Pages/About.razor
+++ b/app/MindWork AI Studio/Pages/About.razor
@@ -102,12 +102,17 @@
-
+
+
+
+
+
+
diff --git a/app/MindWork AI Studio/Pages/Supporters.razor b/app/MindWork AI Studio/Pages/Supporters.razor
index 91a744a3..cffedf7d 100644
--- a/app/MindWork AI Studio/Pages/Supporters.razor
+++ b/app/MindWork AI Studio/Pages/Supporters.razor
@@ -79,6 +79,7 @@
+
diff --git a/app/MindWork AI Studio/wwwroot/changelog/v0.9.39.md b/app/MindWork AI Studio/wwwroot/changelog/v0.9.39.md
index 9f573df3..be69518e 100644
--- a/app/MindWork AI Studio/wwwroot/changelog/v0.9.39.md
+++ b/app/MindWork AI Studio/wwwroot/changelog/v0.9.39.md
@@ -1,7 +1,8 @@
-# v0.9.39, build 214 (2025-03-xx xx:xx UTC)
+# v0.9.39, build 214 (2025-04-xx xx:xx UTC)
- Added a feature flag for the plugin system. This flag is disabled by default and can be enabled inside the app settings. Please note that this feature is still in development; there are no plugins available yet.
- Added the Lua library we use for the plugin system to the about page.
- Added the plugin overview page. This page shows all installed plugins and allows you to enable or disable them. It is only available when the plugin preview feature is enabled.
- Added hot reloading for plugins. When any plugin is changed, the app will automatically reload the plugin without needing to restart the app.
+- Added an API for streaming arbitrary local files to the embedding process. Thanks to Nils (`nilskruthoff`) for this great contribution.
- Fixed the preview tooltip component not showing the correct position when used inside a scrollable container.
- Upgraded to Rust 1.85.1
\ No newline at end of file
diff --git a/runtime/Cargo.lock b/runtime/Cargo.lock
index e16f6234..2ff68b78 100644
--- a/runtime/Cargo.lock
+++ b/runtime/Cargo.lock
@@ -131,9 +131,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "async-stream"
-version = "0.3.5"
+version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51"
+checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476"
dependencies = [
"async-stream-impl",
"futures-core",
@@ -142,9 +142,9 @@ dependencies = [
[[package]]
name = "async-stream-impl"
-version = "0.3.5"
+version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
+checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
dependencies = [
"proc-macro2",
"quote",
@@ -442,7 +442,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1"
dependencies = [
"byteorder",
- "chrono",
"codepage",
"encoding_rs",
"log",
@@ -476,6 +475,8 @@ version = "1.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c"
dependencies = [
+ "jobserver",
+ "libc",
"shlex",
]
@@ -1152,9 +1153,9 @@ dependencies = [
[[package]]
name = "flexi_logger"
-version = "0.29.8"
+version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "88a5a6882b2e137c4f2664562995865084eb5a00611fba30c582ef10354c4ad8"
+checksum = "6807d19113a0dac26d3dae81ef5859058c341b31c71d0f8d0298b17327f21011"
dependencies = [
"chrono",
"log",
@@ -1241,12 +1242,13 @@ dependencies = [
[[package]]
name = "futures"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0"
+checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
dependencies = [
"futures-channel",
"futures-core",
+ "futures-executor",
"futures-io",
"futures-sink",
"futures-task",
@@ -1255,9 +1257,9 @@ dependencies = [
[[package]]
name = "futures-channel"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
+checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
dependencies = [
"futures-core",
"futures-sink",
@@ -1265,15 +1267,15 @@ dependencies = [
[[package]]
name = "futures-core"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
+checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "futures-executor"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d"
+checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
dependencies = [
"futures-core",
"futures-task",
@@ -1282,15 +1284,15 @@ dependencies = [
[[package]]
name = "futures-io"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
+checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-macro"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
+checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
@@ -1299,21 +1301,21 @@ dependencies = [
[[package]]
name = "futures-sink"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"
+checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
[[package]]
name = "futures-task"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"
+checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
[[package]]
name = "futures-util"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
+checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
"futures-channel",
"futures-core",
@@ -1467,15 +1469,16 @@ dependencies = [
]
[[package]]
-name = "gif"
-version = "0.13.1"
+name = "getrandom"
+version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3fb2d69b19215e18bb912fa30f7ce15846e301408695e44e0ef719f1da9e19f2"
+checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
- "color_quant",
- "weezl",
+ "cfg-if",
+ "libc",
+ "wasi 0.11.0+wasi-snapshot-preview1",
]
-
+
[[package]]
name = "getrandom"
version = "0.3.1"
@@ -1488,6 +1491,16 @@ dependencies = [
"windows-targets 0.52.6",
]
+[[package]]
+name = "gif"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fb2d69b19215e18bb912fa30f7ce15846e301408695e44e0ef719f1da9e19f2"
+dependencies = [
+ "color_quant",
+ "weezl",
+]
+
[[package]]
name = "gimli"
version = "0.29.0"
@@ -2647,13 +2660,14 @@ version = "0.9.38"
dependencies = [
"aes",
"arboard",
+ "async-stream",
"base64 0.22.1",
"calamine",
"cbc",
- "chrono",
"cipher",
"file-format",
"flexi_logger",
+ "futures",
"hmac",
"keyring",
"log",
@@ -2674,6 +2688,7 @@ dependencies = [
"tauri-build",
"tauri-plugin-window-state",
"tokio",
+ "tokio-stream",
"url",
]
@@ -5294,9 +5309,9 @@ dependencies = [
[[package]]
name = "tokio-stream"
-version = "0.1.15"
+version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "267ac89e0bec6e691e5813911606935d77c476ff49024f98abcea3e7b15e37af"
+checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
dependencies = [
"futures-core",
"pin-project-lite",
diff --git a/runtime/Cargo.toml b/runtime/Cargo.toml
index b23437e5..61e62fbb 100644
--- a/runtime/Cargo.toml
+++ b/runtime/Cargo.toml
@@ -16,7 +16,10 @@ serde_json = "1.0.134"
keyring = { version = "3.6.2", features = ["apple-native", "windows-native", "sync-secret-service"] }
arboard = "3.4.1"
tokio = { version = "1.44", features = ["rt", "rt-multi-thread", "macros", "process"] }
-flexi_logger = "0.29.8"
+tokio-stream = "0.1.17"
+futures = "0.3.31"
+async-stream = "0.3.6"
+flexi_logger = "0.30.0"
log = { version = "0.4.26", features = ["kv"] }
once_cell = "1.20.3"
rocket = { version = "0.5.1", features = ["json", "tls"] }
@@ -31,11 +34,8 @@ hmac = "0.12.1"
sha2 = "0.10.8"
rcgen = { version = "0.13.2", features = ["pem"] }
file-format = "0.26.0"
-calamine = "0.22"
-pdfium-render = "0.8.27"
-async-stream = "0.3"
-futures = "0.3"
-tokio-stream = "0.1"
+calamine = "0.26.1"
+pdfium-render = "0.8.29"
# Fixes security vulnerability downstream, where the upstream is not fixed yet:
url = "2.5"
diff --git a/runtime/src/file_data.rs b/runtime/src/file_data.rs
index d6a68400..29bc477a 100644
--- a/runtime/src/file_data.rs
+++ b/runtime/src/file_data.rs
@@ -10,7 +10,7 @@ use tokio::io::AsyncBufReadExt;
use tokio::process::Command;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
-use rocket::{State, Shutdown};
+use rocket::Shutdown;
use rocket::response::stream::{EventStream, Event};
use rocket::tokio::select;
use rocket::serde::Serialize;
@@ -38,11 +38,10 @@ const ODT: &str = "odt";
type Result = std::result::Result>;
type ChunkStream = Pin> + Send>>;
-#[get("/system/file-data/extract?<path>")]
+#[get("/retrieval/fs/extract?<path>")]
pub async fn extract_data(path: String, mut end: Shutdown) -> EventStream![] {
EventStream! {
let stream_result = stream_data(&path).await;
-
match stream_result {
Ok(mut stream) => {
loop {
@@ -61,6 +60,7 @@ pub async fn extract_data(path: String, mut end: Shutdown) -> EventStream![] {
yield Event::json(&chunk);
}
},
+
Err(e) => {
yield Event::json(&format!("Error starting stream: {}", e));
}
@@ -74,21 +74,21 @@ async fn stream_data(file_path: &str) -> Result {
}
let file_path_clone = file_path.to_owned();
-
let fmt = tokio::task::spawn_blocking(move || {
FileFormat::from_file(&file_path_clone)
}).await??;
let ext = file_path.split('.').last().unwrap_or("");
-
let stream = match ext {
DOCX | ODT => {
let from = if ext == DOCX { "docx" } else { "odt" };
convert_with_pandoc(file_path, from, TO_MARKDOWN).await?
}
+
"xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => {
stream_spreadsheet_as_csv(file_path).await?
}
+
_ => match fmt.kind() {
Kind::Document => match fmt {
FileFormat::PortableDocumentFormat => read_pdf(file_path).await?,
@@ -100,20 +100,24 @@ async fn stream_data(file_path: &str) -> Result {
}
_ => stream_text_file(file_path).await?,
},
+
Kind::Ebook => return Err("Ebooks not yet supported".into()),
Kind::Image => chunk_image(file_path).await?,
+
Kind::Other => match fmt {
FileFormat::HypertextMarkupLanguage => {
convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
}
_ => stream_text_file(file_path).await?,
},
+
Kind::Presentation => match fmt {
FileFormat::OfficeOpenXmlPresentation => {
convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
}
_ => stream_text_file(file_path).await?,
},
+
Kind::Spreadsheet => stream_spreadsheet_as_csv(file_path).await?,
_ => stream_text_file(file_path).await?,
},
@@ -156,7 +160,7 @@ async fn read_pdf(file_path: &str) -> Result {
};
for (i, page) in doc.pages().iter().enumerate() {
- let content = match page.text().and_then(|t| Ok(t.all())) {
+ let content = match page.text().map(|t| t.all()) {
Ok(c) => c,
Err(e) => {
let _ = tx.blocking_send(Err(e.into()));
@@ -191,12 +195,11 @@ async fn stream_spreadsheet_as_csv(file_path: &str) -> Result {
for sheet_name in workbook.sheet_names() {
let range = match workbook.worksheet_range(&sheet_name) {
- Some(Ok(r)) => r,
- Some(Err(e)) => {
+ Ok(r) => r,
+ Err(e) => {
let _ = tx.blocking_send(Err(e.into()));
continue;
}
- None => continue,
};
for (row_idx, row) in range.rows().enumerate() {
@@ -228,7 +231,7 @@ async fn convert_with_pandoc(
) -> Result {
let output = Command::new("pandoc")
.arg(file_path)
- .args(&["-f", from, "-t", to])
+ .args(["-f", from, "-t", to])
.output()
.await?;