mirror of
				https://github.com/MindWorkAI/AI-Studio.git
				synced 2025-11-04 04:20:20 +00:00 
			
		
		
		
	Add endpoint to read and extract text from PDF files without streaming
This commit is contained in:
		
							parent
							
								
									ecaabfaa2a
								
							
						
					
					
						commit
						7d64767cd3
					
				@ -0,0 +1,16 @@
 | 
				
			|||||||
 | 
					namespace AIStudio.Tools.Services;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					public sealed partial class RustService
					{
					    /// <summary>
					    /// Asks the runtime's <c>/retrieval/fs/read/pdf</c> endpoint for the text of a PDF file.
					    /// </summary>
					    /// <param name="filePath">The path of the PDF file, sent as the <c>file_path</c> query parameter.</param>
					    /// <returns>
					    /// The response body as a string when the request succeeds; an empty string
					    /// when the response carries a non-success status code (the status is logged).
					    /// </returns>
					    public async Task<string> GetPDFText(string filePath)
					    {
					        // File paths may contain characters that are significant inside a query
					        // string ('&', '#', '+', '%', ...). Escape the value so that the server
					        // receives the path exactly as given instead of a truncated or altered one.
					        var encodedPath = Uri.EscapeDataString(filePath);
					
					        // Dispose the response once the body was read to release the connection resources.
					        using var response = await this.http.GetAsync($"/retrieval/fs/read/pdf?file_path={encodedPath}");
					        if (!response.IsSuccessStatusCode)
					        {
					            this.logger!.LogError($"Failed to read the PDF file due to an network error: '{response.StatusCode}'");
					            return string.Empty;
					        }
					
					        return await response.Content.ReadAsStringAsync();
					    }
					}
 | 
				
			||||||
@ -146,6 +146,30 @@ async fn stream_text_file(file_path: &str) -> Result<ChunkStream> {
 | 
				
			|||||||
    Ok(Box::pin(stream))
 | 
					    Ok(Box::pin(stream))
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#[get("/retrieval/fs/read/pdf?<file_path>")]
 | 
				
			||||||
 | 
					pub fn read_pdf(_token: APIToken, file_path: String) -> String {
 | 
				
			||||||
 | 
					    let pdfium = Pdfium::default();
 | 
				
			||||||
 | 
					    let doc = match pdfium.load_pdf_from_file(&file_path, None) {
 | 
				
			||||||
 | 
					        Ok(document) => document,
 | 
				
			||||||
 | 
					        Err(e) => return e.to_string(),
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let mut pdf_content = String::new();
 | 
				
			||||||
 | 
					    for page in doc.pages().iter() {
 | 
				
			||||||
 | 
					        let content = match page.text().map(|text_content| text_content.all()) {
 | 
				
			||||||
 | 
					            Ok(content) => content,
 | 
				
			||||||
 | 
					            Err(_) => {
 | 
				
			||||||
 | 
					                continue
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        pdf_content.push_str(&content);
 | 
				
			||||||
 | 
					        pdf_content.push_str("\n\n");
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    pdf_content
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
async fn stream_pdf(file_path: &str) -> Result<ChunkStream> {
 | 
					async fn stream_pdf(file_path: &str) -> Result<ChunkStream> {
 | 
				
			||||||
    let path = file_path.to_owned();
 | 
					    let path = file_path.to_owned();
 | 
				
			||||||
    let (tx, rx) = mpsc::channel(10);
 | 
					    let (tx, rx) = mpsc::channel(10);
 | 
				
			||||||
 | 
				
			|||||||
@ -79,6 +79,7 @@ pub fn start_runtime_api() {
 | 
				
			|||||||
                crate::environment::get_config_directory,
 | 
					                crate::environment::get_config_directory,
 | 
				
			||||||
                crate::environment::read_user_language,
 | 
					                crate::environment::read_user_language,
 | 
				
			||||||
                crate::file_data::extract_data,
 | 
					                crate::file_data::extract_data,
 | 
				
			||||||
 | 
					                crate::file_data::read_pdf,
 | 
				
			||||||
                crate::log::get_log_paths,
 | 
					                crate::log::get_log_paths,
 | 
				
			||||||
            ])
 | 
					            ])
 | 
				
			||||||
            .ignite().await.unwrap()
 | 
					            .ignite().await.unwrap()
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user