From 07c1182611dac3364f7454c9e8f6893a4985c016 Mon Sep 17 00:00:00 2001 From: Thorsten Sommer Date: Tue, 1 Apr 2025 20:44:43 +0200 Subject: [PATCH] Housekeeping (#379) --- README.md | 2 +- app/MindWork AI Studio/Pages/About.razor | 7 +- app/MindWork AI Studio/Pages/Supporters.razor | 1 + .../wwwroot/changelog/v0.9.39.md | 3 +- runtime/Cargo.lock | 83 +++++++++++-------- runtime/Cargo.toml | 12 +-- runtime/src/file_data.rs | 23 ++--- 7 files changed, 78 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 84f945a2..9c1b3a37 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Things we are currently working on: - [x] ~~App: Configure embedding providers (PR [#224](https://github.com/MindWorkAI/AI-Studio/pull/224))~~ - [x] ~~App: Implement an [ERI](https://github.com/MindWorkAI/ERI) server coding assistant (PR [#231](https://github.com/MindWorkAI/AI-Studio/pull/231))~~ - [x] ~~App: Management of data sources (local & external data via [ERI](https://github.com/MindWorkAI/ERI)) (PR [#259](https://github.com/MindWorkAI/AI-Studio/pull/259), [#273](https://github.com/MindWorkAI/AI-Studio/pull/273))~~ - - [ ] Runtime: Extract data from txt / md / pdf / docx / xlsx files + - [x] ~~Runtime: Extract data from txt / md / pdf / docx / xlsx files (PR [#374](https://github.com/MindWorkAI/AI-Studio/pull/374))~~ - [ ] (*Optional*) Runtime: Implement internal embedding provider through [fastembed-rs](https://github.com/Anush008/fastembed-rs) - [ ] App: Implement external embedding providers - [ ] App: Implement the process to vectorize one local file using embeddings diff --git a/app/MindWork AI Studio/Pages/About.razor b/app/MindWork AI Studio/Pages/About.razor index 282e5e72..cfc2e559 100644 --- a/app/MindWork AI Studio/Pages/About.razor +++ b/app/MindWork AI Studio/Pages/About.razor @@ -102,12 +102,17 @@ - + + + + + + diff --git a/app/MindWork AI Studio/Pages/Supporters.razor b/app/MindWork AI Studio/Pages/Supporters.razor index 91a744a3..cffedf7d 100644 --- a/app/MindWork AI Studio/Pages/Supporters.razor +++ b/app/MindWork AI Studio/Pages/Supporters.razor @@ -79,6 +79,7 @@ + diff --git a/app/MindWork AI Studio/wwwroot/changelog/v0.9.39.md b/app/MindWork AI Studio/wwwroot/changelog/v0.9.39.md index 9f573df3..be69518e 100644 --- a/app/MindWork AI Studio/wwwroot/changelog/v0.9.39.md +++ b/app/MindWork AI Studio/wwwroot/changelog/v0.9.39.md @@ -1,7 +1,8 @@ -# v0.9.39, build 214 (2025-03-xx xx:xx UTC) +# v0.9.39, build 214 (2025-04-xx xx:xx UTC) - Added a feature flag for the plugin system. This flag is disabled by default and can be enabled inside the app settings. Please note that this feature is still in development; there are no plugins available yet. - Added the Lua library we use for the plugin system to the about page. - Added the plugin overview page. This page shows all installed plugins and allows you to enable or disable them. It is only available when the plugin preview feature is enabled. - Added hot reloading for plugins. When any plugin is changed, the app will automatically reload the plugin without needing to restart the app. +- Added an API for streaming arbitrary local files to the embedding process. Thanks Nils `nilskruthoff` for this great contribution. - Fixed the preview tooltip component not showing the correct position when used inside a scrollable container. - Upgraded to Rust 1.85.1 \ No newline at end of file diff --git a/runtime/Cargo.lock b/runtime/Cargo.lock index e16f6234..2ff68b78 100644 --- a/runtime/Cargo.lock +++ b/runtime/Cargo.lock @@ -131,9 +131,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "async-stream" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" dependencies = [ "async-stream-impl", "futures-core", @@ -142,9 +142,9 @@ dependencies = [ [[package]] name = "async-stream-impl" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", @@ -442,7 +442,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1" dependencies = [ "byteorder", - "chrono", "codepage", "encoding_rs", "log", @@ -476,6 +475,8 @@ version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" dependencies = [ + "jobserver", + "libc", "shlex", ] @@ -1152,9 +1153,9 @@ dependencies = [ [[package]] name = "flexi_logger" -version = "0.29.8" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88a5a6882b2e137c4f2664562995865084eb5a00611fba30c582ef10354c4ad8" +checksum = "6807d19113a0dac26d3dae81ef5859058c341b31c71d0f8d0298b17327f21011" dependencies = [ "chrono", "log", @@ -1241,12 +1242,13 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", + "futures-executor", "futures-io", "futures-sink", "futures-task", @@ -1255,9 +1257,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -1265,15 +1267,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -1282,15 +1284,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", @@ -1299,21 +1301,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -1467,15 +1469,16 @@ dependencies = [ ] [[package]] -name = "gif" -version = "0.13.1" +name = "getrandom" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb2d69b19215e18bb912fa30f7ce15846e301408695e44e0ef719f1da9e19f2" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ - "color_quant", - "weezl", + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", ] - + [[package]] name = "getrandom" version = "0.3.1" @@ -1488,6 +1491,16 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "gif" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb2d69b19215e18bb912fa30f7ce15846e301408695e44e0ef719f1da9e19f2" +dependencies = [ + "color_quant", + "weezl", +] + [[package]] name = "gimli" version = "0.29.0" @@ -2647,13 +2660,14 @@ version = "0.9.38" dependencies = [ "aes", "arboard", + "async-stream", "base64 0.22.1", "calamine", "cbc", - "chrono", "cipher", "file-format", "flexi_logger", + "futures", "hmac", "keyring", "log", @@ -2674,6 +2688,7 @@ dependencies = [ "tauri-build", "tauri-plugin-window-state", "tokio", + "tokio-stream", "url", ] @@ -5294,9 +5309,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.15" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "267ac89e0bec6e691e5813911606935d77c476ff49024f98abcea3e7b15e37af" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" dependencies = [ "futures-core", "pin-project-lite", diff --git a/runtime/Cargo.toml b/runtime/Cargo.toml index b23437e5..61e62fbb 100644 --- a/runtime/Cargo.toml +++ b/runtime/Cargo.toml @@ -16,7 +16,10 @@ serde_json = "1.0.134" keyring = { version = "3.6.2", features = ["apple-native", "windows-native", "sync-secret-service"] } arboard = "3.4.1" tokio = { version = "1.44", features = ["rt", "rt-multi-thread", "macros", "process"] } -flexi_logger = "0.29.8" +tokio-stream = "0.1.17" +futures = "0.3.31" +async-stream = "0.3.6" +flexi_logger = "0.30.0" log = { version = "0.4.26", features = ["kv"] } once_cell = "1.20.3" rocket = { version = "0.5.1", features = ["json", "tls"] } @@ -31,11 +34,8 @@ hmac = "0.12.1" sha2 = "0.10.8" rcgen = { version = "0.13.2", features = ["pem"] } file-format = "0.26.0" -calamine = "0.22" -pdfium-render = "0.8.27" -async-stream = "0.3" -futures = "0.3" -tokio-stream = "0.1" +calamine = "0.26.1" +pdfium-render = "0.8.29" # Fixes security vulnerability downstream, where the upstream is not fixed yet: url = "2.5" diff --git a/runtime/src/file_data.rs b/runtime/src/file_data.rs index d6a68400..29bc477a 100644 --- a/runtime/src/file_data.rs +++ b/runtime/src/file_data.rs @@ -10,7 +10,7 @@ use tokio::io::AsyncBufReadExt; use tokio::process::Command; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; -use rocket::{State, Shutdown}; +use rocket::Shutdown; use rocket::response::stream::{EventStream, Event}; use rocket::tokio::select; use rocket::serde::Serialize; @@ -38,11 +38,10 @@ const ODT: &str = "odt"; type Result = std::result::Result>; type ChunkStream = Pin> + Send>>; -#[get("/system/file-data/extract?")] +#[get("/retrieval/fs/extract?")] pub async fn extract_data(path: String, mut end: Shutdown) -> EventStream![] { EventStream! { let stream_result = stream_data(&path).await; - match stream_result { Ok(mut stream) => { loop { @@ -61,6 +60,7 @@ pub async fn extract_data(path: String, mut end: Shutdown) -> EventStream![] { yield Event::json(&chunk); } }, + Err(e) => { yield Event::json(&format!("Error starting stream: {}", e)); } @@ -74,21 +74,21 @@ async fn stream_data(file_path: &str) -> Result { } let file_path_clone = file_path.to_owned(); - let fmt = tokio::task::spawn_blocking(move || { FileFormat::from_file(&file_path_clone) }).await??; let ext = file_path.split('.').last().unwrap_or(""); - let stream = match ext { DOCX | ODT => { let from = if ext == DOCX { "docx" } else { "odt" }; convert_with_pandoc(file_path, from, TO_MARKDOWN).await? } + "xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => { stream_spreadsheet_as_csv(file_path).await? } + _ => match fmt.kind() { Kind::Document => match fmt { FileFormat::PortableDocumentFormat => read_pdf(file_path).await?, @@ -100,20 +100,24 @@ async fn stream_data(file_path: &str) -> Result { } _ => stream_text_file(file_path).await?, }, + Kind::Ebook => return Err("Ebooks not yet supported".into()), Kind::Image => chunk_image(file_path).await?, + Kind::Other => match fmt { FileFormat::HypertextMarkupLanguage => { convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await? } _ => stream_text_file(file_path).await?, }, + Kind::Presentation => match fmt { FileFormat::OfficeOpenXmlPresentation => { convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await? } _ => stream_text_file(file_path).await?, }, + Kind::Spreadsheet => stream_spreadsheet_as_csv(file_path).await?, _ => stream_text_file(file_path).await?, }, @@ -156,7 +160,7 @@ async fn read_pdf(file_path: &str) -> Result { }; for (i, page) in doc.pages().iter().enumerate() { - let content = match page.text().and_then(|t| Ok(t.all())) { + let content = match page.text().map(|t| t.all()) { Ok(c) => c, Err(e) => { let _ = tx.blocking_send(Err(e.into())); @@ -191,12 +195,11 @@ async fn stream_spreadsheet_as_csv(file_path: &str) -> Result { for sheet_name in workbook.sheet_names() { let range = match workbook.worksheet_range(&sheet_name) { - Some(Ok(r)) => r, - Some(Err(e)) => { + Ok(r) => r, + Err(e) => { let _ = tx.blocking_send(Err(e.into())); continue; } - None => continue, }; for (row_idx, row) in range.rows().enumerate() { @@ -228,7 +231,7 @@ async fn convert_with_pandoc( ) -> Result { let output = Command::new("pandoc") .arg(file_path) - .args(&["-f", from, "-t", to]) + .args(["-f", from, "-t", to]) .output() .await?;