mirror of
https://github.com/MindWorkAI/AI-Studio.git
synced 2025-05-03 09:39:47 +00:00
Add endpoint to read and extract text from PDF files without streaming
This commit is contained in:
parent
ecaabfaa2a
commit
7d64767cd3
@ -0,0 +1,16 @@
|
|||||||
|
namespace AIStudio.Tools.Services;
|
||||||
|
|
||||||
|
public sealed partial class RustService
{
    /// <summary>
    /// Asks the Rust runtime to read a PDF file and return its text content in one response.
    /// </summary>
    /// <param name="filePath">The path of the PDF file, as seen by the Rust runtime.</param>
    /// <returns>
    /// The response body returned by the runtime, or an empty string when the
    /// request did not complete with a success status code.
    /// </returns>
    public async Task<string> GetPDFText(string filePath)
    {
        // The path travels as a query-string value, so it must be percent-encoded.
        // Otherwise characters such as ' ', '&', '#', '+' or '%' would truncate or
        // alter the path before it reaches the runtime.
        var encodedFilePath = Uri.EscapeDataString(filePath);
        var response = await this.http.GetAsync($"/retrieval/fs/read/pdf?file_path={encodedFilePath}");
        if (!response.IsSuccessStatusCode)
        {
            this.logger!.LogError($"Failed to read the PDF file due to an network error: '{response.StatusCode}'");
            return string.Empty;
        }

        return await response.Content.ReadAsStringAsync();
    }
}
|
@ -146,6 +146,30 @@ async fn stream_text_file(file_path: &str) -> Result<ChunkStream> {
|
|||||||
Ok(Box::pin(stream))
|
Ok(Box::pin(stream))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[get("/retrieval/fs/read/pdf?<file_path>")]
|
||||||
|
pub fn read_pdf(_token: APIToken, file_path: String) -> String {
|
||||||
|
let pdfium = Pdfium::default();
|
||||||
|
let doc = match pdfium.load_pdf_from_file(&file_path, None) {
|
||||||
|
Ok(document) => document,
|
||||||
|
Err(e) => return e.to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut pdf_content = String::new();
|
||||||
|
for page in doc.pages().iter() {
|
||||||
|
let content = match page.text().map(|text_content| text_content.all()) {
|
||||||
|
Ok(content) => content,
|
||||||
|
Err(_) => {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
pdf_content.push_str(&content);
|
||||||
|
pdf_content.push_str("\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
pdf_content
|
||||||
|
}
|
||||||
|
|
||||||
async fn stream_pdf(file_path: &str) -> Result<ChunkStream> {
|
async fn stream_pdf(file_path: &str) -> Result<ChunkStream> {
|
||||||
let path = file_path.to_owned();
|
let path = file_path.to_owned();
|
||||||
let (tx, rx) = mpsc::channel(10);
|
let (tx, rx) = mpsc::channel(10);
|
||||||
|
@ -79,6 +79,7 @@ pub fn start_runtime_api() {
|
|||||||
crate::environment::get_config_directory,
|
crate::environment::get_config_directory,
|
||||||
crate::environment::read_user_language,
|
crate::environment::read_user_language,
|
||||||
crate::file_data::extract_data,
|
crate::file_data::extract_data,
|
||||||
|
crate::file_data::read_pdf,
|
||||||
crate::log::get_log_paths,
|
crate::log::get_log_paths,
|
||||||
])
|
])
|
||||||
.ignite().await.unwrap()
|
.ignite().await.unwrap()
|
||||||
|
Loading…
Reference in New Issue
Block a user