Add endpoint to read and extract text from PDF files without streaming

This commit is contained in:
Thorsten Sommer 2025-05-02 15:37:19 +02:00
parent ecaabfaa2a
commit 7d64767cd3
Signed by: tsommer
GPG Key ID: 371BBA77A02C0108
3 changed files with 41 additions and 0 deletions

View File

@ -0,0 +1,16 @@
namespace AIStudio.Tools.Services;
public sealed partial class RustService
{
    /// <summary>
    /// Requests the text of a PDF file from the <c>/retrieval/fs/read/pdf</c> endpoint
    /// and returns the whole response body as a single string.
    /// </summary>
    /// <param name="filePath">The path of the PDF file, sent as the <c>file_path</c> query parameter.</param>
    /// <returns>
    /// The response body when the request succeeds; an empty string when the
    /// response carries a non-success status code (the failure is logged).
    /// </returns>
    public async Task<string> GetPDFText(string filePath)
    {
        // File paths routinely contain characters that are reserved inside a query
        // string (spaces, '&', '#', '+', '%', non-ASCII). Percent-encode the value
        // so that the receiving side gets the path exactly as it was passed in.
        var encodedPath = Uri.EscapeDataString(filePath);
        var response = await this.http.GetAsync($"/retrieval/fs/read/pdf?file_path={encodedPath}");
        if (!response.IsSuccessStatusCode)
        {
            this.logger!.LogError($"Failed to read the PDF file due to an network error: '{response.StatusCode}'");
            return string.Empty;
        }

        return await response.Content.ReadAsStringAsync();
    }
}

View File

@ -146,6 +146,30 @@ async fn stream_text_file(file_path: &str) -> Result<ChunkStream> {
Ok(Box::pin(stream))
}
#[get("/retrieval/fs/read/pdf?<file_path>")]
pub fn read_pdf(_token: APIToken, file_path: String) -> String {
let pdfium = Pdfium::default();
let doc = match pdfium.load_pdf_from_file(&file_path, None) {
Ok(document) => document,
Err(e) => return e.to_string(),
};
let mut pdf_content = String::new();
for page in doc.pages().iter() {
let content = match page.text().map(|text_content| text_content.all()) {
Ok(content) => content,
Err(_) => {
continue
}
};
pdf_content.push_str(&content);
pdf_content.push_str("\n\n");
}
pdf_content
}
async fn stream_pdf(file_path: &str) -> Result<ChunkStream> {
let path = file_path.to_owned();
let (tx, rx) = mpsc::channel(10);

View File

@ -79,6 +79,7 @@ pub fn start_runtime_api() {
crate::environment::get_config_directory,
crate::environment::read_user_language,
crate::file_data::extract_data,
crate::file_data::read_pdf,
crate::log::get_log_paths,
])
.ignite().await.unwrap()