diff --git a/app/MindWork AI Studio/Tools/Services/RustService.Retrieval.cs b/app/MindWork AI Studio/Tools/Services/RustService.Retrieval.cs
new file mode 100644
index 00000000..4a201564
--- /dev/null
+++ b/app/MindWork AI Studio/Tools/Services/RustService.Retrieval.cs
@@ -0,0 +1,23 @@
+namespace AIStudio.Tools.Services;
+
+public sealed partial class RustService
+{
+    /// <summary>
+    /// Requests the text of a PDF file from the runtime's PDF read endpoint.
+    /// </summary>
+    /// <param name="filePath">The path of the PDF file to read.</param>
+    /// <returns>The response body on success; an empty string when the request did not succeed.</returns>
+    public async Task<string> GetPDFText(string filePath)
+    {
+        // The path may contain characters such as '&', '#', '+' or spaces, which would
+        // otherwise truncate or alter the query string, so it is escaped first.
+        var response = await this.http.GetAsync($"/retrieval/fs/read/pdf?file_path={Uri.EscapeDataString(filePath)}");
+        if (!response.IsSuccessStatusCode)
+        {
+            this.logger!.LogError($"Failed to read the PDF file due to an network error: '{response.StatusCode}'");
+            return string.Empty;
+        }
+
+        return await response.Content.ReadAsStringAsync();
+    }
+}
\ No newline at end of file
diff --git a/runtime/src/file_data.rs b/runtime/src/file_data.rs
index 344a1785..d5349f71 100644
--- a/runtime/src/file_data.rs
+++ b/runtime/src/file_data.rs
@@ -146,6 +146,34 @@ async fn stream_text_file(file_path: &str) -> Result {
     Ok(Box::pin(stream))
 }
 
+/// Extracts the text of every page of the PDF at `file_path`.
+///
+/// The text of each page is appended in iteration order, followed by `"\n\n"`.
+/// Pages whose text cannot be read are skipped. If the document cannot be
+/// loaded, the error message is returned as the response body instead.
+#[get("/retrieval/fs/read/pdf?<file_path>")]
+pub fn read_pdf(_token: APIToken, file_path: String) -> String {
+    let pdfium = Pdfium::default();
+    let doc = match pdfium.load_pdf_from_file(&file_path, None) {
+        Ok(document) => document,
+        Err(e) => return e.to_string(),
+    };
+
+    let mut pdf_content = String::new();
+    for page in doc.pages().iter() {
+        let content = match page.text().map(|text_content| text_content.all()) {
+            Ok(content) => content,
+            // Best effort: one unreadable page must not discard the rest of the document.
+            Err(_) => continue,
+        };
+
+        pdf_content.push_str(&content);
+        pdf_content.push_str("\n\n");
+    }
+
+    pdf_content
+}
+
 async fn stream_pdf(file_path: &str) -> Result {
     let path = file_path.to_owned();
     let (tx, rx) = mpsc::channel(10);
diff --git a/runtime/src/runtime_api.rs b/runtime/src/runtime_api.rs
index bf3fa249..459fc936 100644
--- a/runtime/src/runtime_api.rs
+++ b/runtime/src/runtime_api.rs
@@ -79,6 +79,7 @@ pub fn start_runtime_api() {
                 crate::environment::get_config_directory,
                 crate::environment::read_user_language,
                 crate::file_data::extract_data,
+                crate::file_data::read_pdf,
                 crate::log::get_log_paths,
             ])
             .ignite().await.unwrap()