2025-06-23 12:49:36 +00:00
|
|
|
|
use std::cmp::min;
|
|
|
|
|
use crate::api_token::APIToken;
|
|
|
|
|
use crate::pandoc::PandocProcessBuilder;
|
|
|
|
|
use crate::pdfium::PdfiumInit;
|
2025-04-01 17:10:29 +00:00
|
|
|
|
use async_stream::stream;
|
|
|
|
|
use base64::{engine::general_purpose, Engine as _};
|
|
|
|
|
use calamine::{open_workbook_auto, Reader};
|
|
|
|
|
use file_format::{FileFormat, Kind};
|
|
|
|
|
use futures::{Stream, StreamExt};
|
|
|
|
|
use pdfium_render::prelude::Pdfium;
|
2025-06-23 12:49:36 +00:00
|
|
|
|
use pptx_to_md::{ImageHandlingMode, ParserConfig, PptxContainer};
|
|
|
|
|
use rocket::get;
|
|
|
|
|
use rocket::response::stream::{Event, EventStream};
|
|
|
|
|
use rocket::serde::Serialize;
|
|
|
|
|
use rocket::tokio::select;
|
|
|
|
|
use rocket::Shutdown;
|
|
|
|
|
use std::path::Path;
|
|
|
|
|
use std::pin::Pin;
|
2025-04-01 17:10:29 +00:00
|
|
|
|
use tokio::io::AsyncBufReadExt;
|
|
|
|
|
use tokio::sync::mpsc;
|
|
|
|
|
use tokio_stream::wrappers::ReceiverStream;
|
|
|
|
|
|
|
|
|
|
/// A single unit of extracted content streamed back to the client as one SSE event.
#[derive(Debug, Serialize)]
pub struct Chunk {
    /// The extracted text — or, for images, a base64 payload (possibly one segment of it).
    pub content: String,
    /// Positional/source information describing where `content` came from.
    pub metadata: Metadata,
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, Serialize)]
|
|
|
|
|
pub enum Metadata {
|
|
|
|
|
Text { line_number: usize },
|
|
|
|
|
Pdf { page_number: usize },
|
|
|
|
|
Spreadsheet { sheet_name: String, row_number: usize },
|
|
|
|
|
Document,
|
|
|
|
|
Image,
|
2025-06-23 12:49:36 +00:00
|
|
|
|
Presentation {
|
|
|
|
|
slide_number: u32,
|
|
|
|
|
image: Option<Base64Image>,
|
|
|
|
|
},
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// One segment of a base64-encoded image extracted from a presentation.
/// Large images are split into pieces of at most `IMAGE_SEGMENT_SIZE_IN_CHARS`
/// characters and streamed one segment per chunk.
#[derive(Debug, Serialize)]
pub struct Base64Image {
    /// Identifier of the image this segment belongs to.
    pub id: String,
    /// The base64 payload of this segment.
    pub content: String,
    /// 0-based index of this segment within the image.
    pub segment: usize,
    /// True when this is the last segment of the image.
    pub is_end: bool
}
|
|
|
|
|
|
|
|
|
|
impl Base64Image {
|
|
|
|
|
fn new(id: String, content: String, segment: usize, is_end: bool) -> Self {
|
|
|
|
|
Self { id, content, segment, is_end }
|
|
|
|
|
}
|
2025-06-23 12:17:43 +00:00
|
|
|
|
Document {},
|
2025-06-24 08:20:31 +00:00
|
|
|
|
Image {},
|
2025-04-01 17:10:29 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Pandoc output format used for all document conversions in this module.
const TO_MARKDOWN: &str = "markdown";

// File extensions routed through the pandoc conversion path.
const DOCX: &str = "docx";

const ODT: &str = "odt";

// Upper bound on one base64 image segment streamed to the client.
const IMAGE_SEGMENT_SIZE_IN_CHARS: usize = 8_192; // equivalent to ~ 5500 token
|
2025-04-01 17:10:29 +00:00
|
|
|
|
|
|
|
|
|
// Module-local Result with a boxed, Send + Sync error so errors from every
// backend crate (pdfium, calamine, pandoc, pptx) can flow through one type.
type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;

// A pinned, boxed stream of extraction results, as returned by every
// `stream_*` / `chunk_*` helper below.
type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>;
|
|
|
|
|
|
2025-04-01 18:44:43 +00:00
|
|
|
|
/// SSE endpoint that streams extracted chunks of the file at `path`.
///
/// The stream ends when the source is exhausted, on the first stream error
/// (reported to the client as a JSON string event), or when the Rocket
/// server shuts down.
#[get("/retrieval/fs/extract?<path>")]
pub async fn extract_data(_token: APIToken, path: String, mut end: Shutdown) -> EventStream![] {
    EventStream! {
        let stream_result = stream_data(&path).await;
        match stream_result {
            Ok(mut stream) => {
                loop {
                    // Race the next chunk against server shutdown.
                    let chunk = select! {
                        chunk = stream.next() => match chunk {
                            Some(Ok(chunk)) => chunk,
                            Some(Err(e)) => {
                                // Surface the error to the client, then stop.
                                yield Event::json(&format!("Error: {}", e));
                                break;
                            },
                            // Source exhausted: end the event stream normally.
                            None => break,
                        },
                        // Shutdown signal fired: stop streaming immediately.
                        _ = &mut end => break,
                    };

                    yield Event::json(&chunk);
                }
            },
            Err(e) => {
                // Could not even open/dispatch the file; report once and end.
                yield Event::json(&format!("Error starting stream: {}", e));
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
/// Chooses an extraction strategy for `file_path` and returns a chunk stream.
///
/// Dispatch is two-tiered: first on the literal file extension (docx/odt,
/// pptx, spreadsheet formats), then on the sniffed file-format kind as a
/// fallback, defaulting to plain line-by-line text streaming.
async fn stream_data(file_path: &str) -> Result<ChunkStream> {
    if !Path::new(file_path).exists() {
        return Err("File does not exist.".into());
    }

    // Format sniffing reads the file (blocking I/O) — keep it off the async runtime.
    let file_path_clone = file_path.to_owned();
    let fmt = tokio::task::spawn_blocking(move || {
        FileFormat::from_file(&file_path_clone)
    }).await??;

    // NOTE(review): for a path with no '.' this yields the whole file name,
    // and matching is case-sensitive (e.g. "DOCX" falls through to the
    // format-kind fallback below) — confirm that is intended.
    let ext = file_path.split('.').next_back().unwrap_or("");

    let stream = match ext {
        DOCX | ODT => {
            let from = if ext == DOCX { "docx" } else { "odt" };
            convert_with_pandoc(file_path, from, TO_MARKDOWN).await?
        }

        "pptx" => stream_pptx(file_path).await?,

        "xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => {
            stream_spreadsheet_as_csv(file_path).await?
        }

        // Unknown extension: fall back to the sniffed format's kind.
        _ => match fmt.kind() {
            Kind::Document => match fmt {
                FileFormat::PortableDocumentFormat => stream_pdf(file_path).await?,
                FileFormat::MicrosoftWordDocument => {
                    convert_with_pandoc(file_path, "docx", TO_MARKDOWN).await?
                }
                FileFormat::OfficeOpenXmlDocument => {
                    convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
                }
                _ => stream_text_file(file_path).await?,
            },

            Kind::Ebook => return Err("Ebooks not yet supported".into()),
            Kind::Image => chunk_image(file_path).await?,

            Kind::Other => match fmt {
                // HTML is text, but pandoc gives much cleaner markdown output.
                FileFormat::HypertextMarkupLanguage => {
                    convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
                }
                _ => stream_text_file(file_path).await?,
            },

            Kind::Presentation => match fmt {
                FileFormat::OfficeOpenXmlPresentation => {
                    stream_pptx(file_path).await?
                }
                _ => stream_text_file(file_path).await?,
            },

            Kind::Spreadsheet => stream_spreadsheet_as_csv(file_path).await?,

            // Anything else is treated as plain text.
            _ => stream_text_file(file_path).await?,
        },
    };

    Ok(Box::pin(stream))
}
|
|
|
|
|
|
|
|
|
|
async fn stream_text_file(file_path: &str) -> Result<ChunkStream> {
|
|
|
|
|
let file = tokio::fs::File::open(file_path).await?;
|
|
|
|
|
let reader = tokio::io::BufReader::new(file);
|
|
|
|
|
let mut lines = reader.lines();
|
|
|
|
|
let mut line_number = 0;
|
|
|
|
|
|
|
|
|
|
let stream = stream! {
|
2025-05-02 21:09:50 +00:00
|
|
|
|
while let Ok(Some(line)) = lines.next_line().await {
|
2025-04-01 17:10:29 +00:00
|
|
|
|
line_number += 1;
|
|
|
|
|
yield Ok(Chunk {
|
|
|
|
|
content: line,
|
|
|
|
|
metadata: Metadata::Text { line_number },
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Ok(Box::pin(stream))
|
|
|
|
|
}
|
|
|
|
|
|
2025-05-02 21:09:50 +00:00
|
|
|
|
#[get("/retrieval/fs/read/pdf?<file_path>")]
|
|
|
|
|
pub fn read_pdf(_token: APIToken, file_path: String) -> String {
|
2025-05-03 10:20:22 +00:00
|
|
|
|
let pdfium = Pdfium::ai_studio_init();
|
2025-05-02 21:09:50 +00:00
|
|
|
|
let doc = match pdfium.load_pdf_from_file(&file_path, None) {
|
|
|
|
|
Ok(document) => document,
|
|
|
|
|
Err(e) => return e.to_string(),
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let mut pdf_content = String::new();
|
|
|
|
|
for page in doc.pages().iter() {
|
|
|
|
|
let content = match page.text().map(|text_content| text_content.all()) {
|
|
|
|
|
Ok(content) => content,
|
|
|
|
|
Err(_) => {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
pdf_content.push_str(&content);
|
|
|
|
|
pdf_content.push_str("\n\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pdf_content
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Streams a PDF page by page; each chunk carries one page's extracted text.
///
/// Pdfium calls are blocking, so extraction runs on a blocking task and
/// results are forwarded through a bounded channel to the async consumer.
async fn stream_pdf(file_path: &str) -> Result<ChunkStream> {
    let path = file_path.to_owned();
    let (tx, rx) = mpsc::channel(10);

    tokio::task::spawn_blocking(move || {
        let pdfium = Pdfium::ai_studio_init();

        let doc = match pdfium.load_pdf_from_file(&path, None) {
            Ok(document) => document,
            Err(e) => {
                // Could not open the document: report once and stop the task.
                let _ = tx.blocking_send(Err(e.into()));
                return;
            }
        };

        for (num_page, page) in doc.pages().iter().enumerate() {
            let content = match page.text().map(|t| t.all()) {
                Ok(text_content) => text_content,
                Err(e) => {
                    // One unreadable page is reported but does not abort the rest.
                    let _ = tx.blocking_send(Err(e.into()));
                    continue;
                }
            };

            if tx.blocking_send(Ok(Chunk {
                content,
                // Pages are reported 1-based for the client.
                metadata: Metadata::Pdf { page_number: num_page + 1 },
            })).is_err() {
                // Receiver dropped; no point extracting further pages.
                break;
            }
        }
    });

    Ok(Box::pin(ReceiverStream::new(rx)))
}
|
|
|
|
|
|
|
|
|
|
async fn stream_spreadsheet_as_csv(file_path: &str) -> Result<ChunkStream> {
|
|
|
|
|
let path = file_path.to_owned();
|
|
|
|
|
let (tx, rx) = mpsc::channel(10);
|
|
|
|
|
|
|
|
|
|
tokio::task::spawn_blocking(move || {
|
|
|
|
|
let mut workbook = match open_workbook_auto(&path) {
|
|
|
|
|
Ok(w) => w,
|
|
|
|
|
Err(e) => {
|
|
|
|
|
let _ = tx.blocking_send(Err(e.into()));
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
for sheet_name in workbook.sheet_names() {
|
|
|
|
|
let range = match workbook.worksheet_range(&sheet_name) {
|
2025-04-01 18:44:43 +00:00
|
|
|
|
Ok(r) => r,
|
|
|
|
|
Err(e) => {
|
2025-04-01 17:10:29 +00:00
|
|
|
|
let _ = tx.blocking_send(Err(e.into()));
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
for (row_idx, row) in range.rows().enumerate() {
|
|
|
|
|
let content = row.iter()
|
|
|
|
|
.map(|cell| cell.to_string())
|
|
|
|
|
.collect::<Vec<_>>()
|
|
|
|
|
.join(",");
|
|
|
|
|
|
|
|
|
|
if tx.blocking_send(Ok(Chunk {
|
|
|
|
|
content,
|
|
|
|
|
metadata: Metadata::Spreadsheet {
|
|
|
|
|
sheet_name: sheet_name.clone(),
|
|
|
|
|
row_number: row_idx + 1,
|
|
|
|
|
},
|
|
|
|
|
})).is_err() {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
Ok(Box::pin(ReceiverStream::new(rx)))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn convert_with_pandoc(
|
|
|
|
|
file_path: &str,
|
|
|
|
|
from: &str,
|
|
|
|
|
to: &str,
|
|
|
|
|
) -> Result<ChunkStream> {
|
2025-05-30 20:39:16 +00:00
|
|
|
|
let output = PandocProcessBuilder::new()
|
|
|
|
|
.with_input_file(file_path)
|
|
|
|
|
.with_input_format(from)
|
|
|
|
|
.with_output_format(to)
|
|
|
|
|
.build()
|
|
|
|
|
.command.output().await?;
|
|
|
|
|
|
2025-04-01 17:10:29 +00:00
|
|
|
|
let stream = stream! {
|
|
|
|
|
if output.status.success() {
|
|
|
|
|
match String::from_utf8(output.stdout.clone()) {
|
|
|
|
|
Ok(content) => yield Ok(Chunk {
|
|
|
|
|
content,
|
2025-06-23 12:17:43 +00:00
|
|
|
|
metadata: Metadata::Document {},
|
2025-04-01 17:10:29 +00:00
|
|
|
|
}),
|
|
|
|
|
Err(e) => yield Err(e.into()),
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
yield Err(format!(
|
|
|
|
|
"Pandoc error: {}",
|
|
|
|
|
String::from_utf8_lossy(&output.stderr)
|
|
|
|
|
).into());
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Ok(Box::pin(stream))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn chunk_image(file_path: &str) -> Result<ChunkStream> {
|
|
|
|
|
let data = tokio::fs::read(file_path).await?;
|
|
|
|
|
let base64 = general_purpose::STANDARD.encode(&data);
|
|
|
|
|
|
|
|
|
|
let stream = stream! {
|
|
|
|
|
yield Ok(Chunk {
|
|
|
|
|
content: base64,
|
2025-06-24 08:20:31 +00:00
|
|
|
|
metadata: Metadata::Image {},
|
2025-04-01 17:10:29 +00:00
|
|
|
|
});
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Ok(Box::pin(stream))
|
2025-06-23 12:49:36 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn stream_pptx(file_path: &str) -> Result<ChunkStream> {
|
|
|
|
|
let path = Path::new(file_path).to_owned();
|
|
|
|
|
|
|
|
|
|
let parser_config = ParserConfig::builder()
|
|
|
|
|
.extract_images(true)
|
|
|
|
|
.compress_images(true)
|
|
|
|
|
.quality(75)
|
|
|
|
|
.image_handling_mode(ImageHandlingMode::Manually)
|
|
|
|
|
.build();
|
|
|
|
|
|
|
|
|
|
let mut streamer = tokio::task::spawn_blocking(move || {
|
|
|
|
|
PptxContainer::open(&path, parser_config).map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)
|
|
|
|
|
}).await??;
|
|
|
|
|
|
|
|
|
|
let (tx, rx) = mpsc::channel(32);
|
|
|
|
|
|
|
|
|
|
tokio::spawn(async move {
|
|
|
|
|
for slide_result in streamer.iter_slides() {
|
|
|
|
|
match slide_result {
|
|
|
|
|
Ok(slide) => {
|
|
|
|
|
if let Some(md_content) = slide.convert_to_md() {
|
|
|
|
|
let chunk = Chunk {
|
|
|
|
|
content: md_content,
|
|
|
|
|
metadata: Metadata::Presentation {
|
|
|
|
|
slide_number: slide.slide_number,
|
|
|
|
|
image: None,
|
|
|
|
|
},
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
if tx.send(Ok(chunk)).await.is_err() {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if let Some(images) = slide.load_images_manually() {
|
|
|
|
|
for image in images.iter() {
|
|
|
|
|
let base64_data = &image.base64_content;
|
|
|
|
|
|
|
|
|
|
let total_length = base64_data.len();
|
|
|
|
|
let mut offset = 0;
|
|
|
|
|
let mut segment_index = 0;
|
|
|
|
|
|
|
|
|
|
while offset < total_length {
|
|
|
|
|
let end = min(offset + IMAGE_SEGMENT_SIZE_IN_CHARS, total_length);
|
|
|
|
|
let segment_content = &base64_data[offset..end];
|
|
|
|
|
let is_end = end == total_length;
|
|
|
|
|
|
|
|
|
|
let base64_image = Base64Image::new(
|
|
|
|
|
image.img_ref.id.clone(),
|
|
|
|
|
segment_content.to_string(),
|
|
|
|
|
segment_index,
|
|
|
|
|
is_end
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let chunk = Chunk {
|
|
|
|
|
content: String::new(),
|
|
|
|
|
metadata: Metadata::Presentation {
|
|
|
|
|
slide_number: slide.slide_number,
|
|
|
|
|
image: Some(base64_image),
|
|
|
|
|
},
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
if tx.send(Ok(chunk)).await.is_err() {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
offset = end;
|
|
|
|
|
segment_index += 1;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
Err(e) => {
|
|
|
|
|
let _ = tx.send(Err(Box::new(e) as Box<dyn std::error::Error + Send + Sync>)).await;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
Ok(Box::pin(ReceiverStream::new(rx)))
|
2025-04-01 17:10:29 +00:00
|
|
|
|
}
|