mirror of
				https://github.com/MindWorkAI/AI-Studio.git
				synced 2025-11-04 04:20:20 +00:00 
			
		
		
		
	Add endpoint to read and extract text from PDF files without streaming
This commit is contained in:
		
							parent
							
								
									ecaabfaa2a
								
							
						
					
					
						commit
						7d64767cd3
					
				@ -0,0 +1,16 @@
 | 
			
		||||
namespace AIStudio.Tools.Services;
 | 
			
		||||
 | 
			
		||||
public sealed partial class RustService
{
    /// <summary>
    /// Requests the complete text of a PDF file from the Rust runtime in one
    /// (non-streaming) HTTP call.
    /// </summary>
    /// <param name="filePath">
    /// Path of the PDF file; it is sent as the <c>file_path</c> query parameter.
    /// </param>
    /// <returns>
    /// The response body as a string, or an empty string when the response
    /// status code does not indicate success.
    /// </returns>
    public async Task<string> GetPDFText(string filePath)
    {
        // Percent-encode the path before embedding it into the query string.
        // Without this, characters such as '&', '#', '+' or '%' inside a file
        // name would be interpreted as URL syntax and the server would receive
        // a truncated or altered path.
        var encodedFilePath = Uri.EscapeDataString(filePath);
        var response = await this.http.GetAsync($"/retrieval/fs/read/pdf?file_path={encodedFilePath}");
        if (!response.IsSuccessStatusCode)
        {
            this.logger!.LogError($"Failed to read the PDF file due to an network error: '{response.StatusCode}'");
            return string.Empty;
        }

        return await response.Content.ReadAsStringAsync();
    }
}
 | 
			
		||||
@ -146,6 +146,30 @@ async fn stream_text_file(file_path: &str) -> Result<ChunkStream> {
 | 
			
		||||
    Ok(Box::pin(stream))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#[get("/retrieval/fs/read/pdf?<file_path>")]
 | 
			
		||||
pub fn read_pdf(_token: APIToken, file_path: String) -> String {
 | 
			
		||||
    let pdfium = Pdfium::default();
 | 
			
		||||
    let doc = match pdfium.load_pdf_from_file(&file_path, None) {
 | 
			
		||||
        Ok(document) => document,
 | 
			
		||||
        Err(e) => return e.to_string(),
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    let mut pdf_content = String::new();
 | 
			
		||||
    for page in doc.pages().iter() {
 | 
			
		||||
        let content = match page.text().map(|text_content| text_content.all()) {
 | 
			
		||||
            Ok(content) => content,
 | 
			
		||||
            Err(_) => {
 | 
			
		||||
                continue
 | 
			
		||||
            }
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        pdf_content.push_str(&content);
 | 
			
		||||
        pdf_content.push_str("\n\n");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pdf_content
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
async fn stream_pdf(file_path: &str) -> Result<ChunkStream> {
 | 
			
		||||
    let path = file_path.to_owned();
 | 
			
		||||
    let (tx, rx) = mpsc::channel(10);
 | 
			
		||||
 | 
			
		||||
@ -79,6 +79,7 @@ pub fn start_runtime_api() {
 | 
			
		||||
                crate::environment::get_config_directory,
 | 
			
		||||
                crate::environment::read_user_language,
 | 
			
		||||
                crate::file_data::extract_data,
 | 
			
		||||
                crate::file_data::read_pdf,
 | 
			
		||||
                crate::log::get_log_paths,
 | 
			
		||||
            ])
 | 
			
		||||
            .ignite().await.unwrap()
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user