mirror of
https://github.com/MindWorkAI/AI-Studio.git
synced 2025-07-04 03:42:56 +00:00
Added support for reading PowerPoint files (#506)
Co-authored-by: krut_ni <nils.kruthoff@dlr.de>
This commit is contained in:
parent
b6169e5bfb
commit
18b2e22725
@ -114,6 +114,7 @@
|
||||
<ThirdPartyComponent Name="async-stream" Developer="Carl Lerche, Taiki Endo & Open Source Community" LicenseName="MIT" LicenseUrl="https://github.com/tokio-rs/async-stream/blob/master/LICENSE" RepositoryUrl="https://github.com/tokio-rs/async-stream" UseCase="@T("This library is used to create asynchronous streams in Rust. It allows us to work with streams of data that can be produced asynchronously, making it easier to handle events or data that arrive over time. We use this, e.g., to stream arbitrary data from the file system to the embedding system.")"/>
|
||||
<ThirdPartyComponent Name="flexi_logger" Developer="emabee & Open Source Community" LicenseName="MIT" LicenseUrl="https://github.com/emabee/flexi_logger/blob/master/LICENSE-MIT" RepositoryUrl="https://github.com/emabee/flexi_logger" UseCase="@T("This Rust library is used to output the app's messages to the terminal. This is helpful during development and troubleshooting. This feature is initially invisible; when the app is started via the terminal, the messages become visible.")"/>
|
||||
<ThirdPartyComponent Name="rand" Developer="Rust developers & Open Source Community" LicenseName="MIT" LicenseUrl="https://github.com/rust-random/rand/blob/master/LICENSE-MIT" RepositoryUrl="https://github.com/rust-random/rand" UseCase="@T("We must generate random numbers, e.g., for securing the interprocess communication between the user interface and the runtime. The rand library is great for this purpose.")"/>
|
||||
<ThirdPartyComponent Name="pptx-to-md" Developer="Nils Kruthoff & Open Source Community" LicenseName="MIT" LicenseUrl="https://github.com/nilskruthoff/pptx-parser/blob/master/LICENCE-MIT" RepositoryUrl="https://github.com/nilskruthoff/pptx-parser" UseCase="@T("#TODO")"/>
|
||||
<ThirdPartyComponent Name="base64" Developer="Marshall Pierce, Alice Maz & Open Source Community" LicenseName="MIT" LicenseUrl="https://github.com/marshallpierce/rust-base64/blob/master/LICENSE-MIT" RepositoryUrl="https://github.com/marshallpierce/rust-base64" UseCase="@T("For some data transfers, we need to encode the data in base64. This Rust library is great for this purpose.")"/>
|
||||
<ThirdPartyComponent Name="Rust Crypto" Developer="Artyom Pavlov, Tony Arcieri, Brian Warner, Arthur Gautier, Vlad Filippov, Friedel Ziegelmayer, Nicolas Stalder & Open Source Community" LicenseName="MIT" LicenseUrl="https://github.com/RustCrypto/traits/blob/master/cipher/LICENSE-MIT" RepositoryUrl="https://github.com/RustCrypto" UseCase="@T("When transferring sensitive data between Rust runtime and .NET app, we encrypt the data. We use some libraries from the Rust Crypto project for this purpose: cipher, aes, cbc, pbkdf2, hmac, and sha2. We are thankful for the great work of the Rust Crypto project.")"/>
|
||||
<ThirdPartyComponent Name="rcgen" Developer="RustTLS developers, est31 & Open Source Community" LicenseName="MIT" LicenseUrl="https://github.com/rustls/rcgen/blob/main/LICENSE" RepositoryUrl="https://github.com/rustls/rcgen" UseCase="@T("For the secure communication between the user interface and the runtime, we need to create certificates. This Rust library is great for this purpose.")"/>
|
||||
|
@ -38,6 +38,7 @@ calamine = "0.27.0"
|
||||
pdfium-render = "0.8.31"
|
||||
sys-locale = "0.3.2"
|
||||
cfg-if = "1.0.0"
|
||||
pptx-to-md = "0.3.0"
|
||||
|
||||
# Fixes security vulnerability downstream, where the upstream is not fixed yet:
|
||||
url = "2.5"
|
||||
|
@ -1,22 +1,24 @@
|
||||
use std::path::Path;
|
||||
use std::pin::Pin;
|
||||
use std::cmp::min;
|
||||
use crate::api_token::APIToken;
|
||||
use crate::pandoc::PandocProcessBuilder;
|
||||
use crate::pdfium::PdfiumInit;
|
||||
use async_stream::stream;
|
||||
use base64::{engine::general_purpose, Engine as _};
|
||||
use calamine::{open_workbook_auto, Reader};
|
||||
use file_format::{FileFormat, Kind};
|
||||
use futures::{Stream, StreamExt};
|
||||
use pdfium_render::prelude::Pdfium;
|
||||
use pptx_to_md::{ImageHandlingMode, ParserConfig, PptxContainer};
|
||||
use rocket::get;
|
||||
use rocket::response::stream::{Event, EventStream};
|
||||
use rocket::serde::Serialize;
|
||||
use rocket::tokio::select;
|
||||
use rocket::Shutdown;
|
||||
use std::path::Path;
|
||||
use std::pin::Pin;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use rocket::Shutdown;
|
||||
use rocket::response::stream::{EventStream, Event};
|
||||
use rocket::tokio::select;
|
||||
use rocket::serde::Serialize;
|
||||
use rocket::get;
|
||||
use crate::api_token::APIToken;
|
||||
use crate::pandoc::PandocProcessBuilder;
|
||||
use crate::pdfium::PdfiumInit;
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Chunk {
|
||||
@ -31,11 +33,30 @@ pub enum Metadata {
|
||||
Spreadsheet { sheet_name: String, row_number: usize },
|
||||
Document,
|
||||
Image,
|
||||
Presentation {
|
||||
slide_number: u32,
|
||||
image: Option<Base64Image>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Base64Image {
|
||||
pub id: String,
|
||||
pub content: String,
|
||||
pub segment: usize,
|
||||
pub is_end: bool
|
||||
}
|
||||
|
||||
impl Base64Image {
|
||||
fn new(id: String, content: String, segment: usize, is_end: bool) -> Self {
|
||||
Self { id, content, segment, is_end }
|
||||
}
|
||||
}
|
||||
|
||||
/// Format name handed to `convert_with_pandoc` as its last argument
/// (presumably the conversion target — confirm against that function).
const TO_MARKDOWN: &str = "markdown";
|
||||
/// Format identifier for `docx` files.
/// NOTE(review): presumably used as a pandoc source format — usage is not visible here; confirm.
const DOCX: &str = "docx";
|
||||
/// Format identifier for `odt` files.
/// NOTE(review): presumably used as a pandoc source format — usage is not visible here; confirm.
const ODT: &str = "odt";
|
||||
/// Upper bound on the length of a single base64 image segment emitted by `stream_pptx`.
const IMAGE_SEGMENT_SIZE_IN_CHARS: usize = 8_192; // equivalent to ~ 5500 tokens
|
||||
|
||||
/// Module-local result alias whose error is a boxed, `Send + Sync` error trait object.
type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
|
||||
/// A pinned, boxed, `Send` stream that yields `Result<Chunk>` items.
type ChunkStream = Pin<Box<dyn Stream<Item = Result<Chunk>> + Send>>;
|
||||
@ -87,6 +108,8 @@ async fn stream_data(file_path: &str) -> Result<ChunkStream> {
|
||||
convert_with_pandoc(file_path, from, TO_MARKDOWN).await?
|
||||
}
|
||||
|
||||
"pptx" => stream_pptx(file_path).await?,
|
||||
|
||||
"xlsx" | "ods" | "xls" | "xlsm" | "xlsb" | "xla" | "xlam" => {
|
||||
stream_spreadsheet_as_csv(file_path).await?
|
||||
}
|
||||
@ -115,7 +138,7 @@ async fn stream_data(file_path: &str) -> Result<ChunkStream> {
|
||||
|
||||
Kind::Presentation => match fmt {
|
||||
FileFormat::OfficeOpenXmlPresentation => {
|
||||
convert_with_pandoc(file_path, fmt.extension(), TO_MARKDOWN).await?
|
||||
stream_pptx(file_path).await?
|
||||
}
|
||||
_ => stream_text_file(file_path).await?,
|
||||
},
|
||||
@ -295,3 +318,86 @@ async fn chunk_image(file_path: &str) -> Result<ChunkStream> {
|
||||
|
||||
Ok(Box::pin(stream))
|
||||
}
|
||||
|
||||
async fn stream_pptx(file_path: &str) -> Result<ChunkStream> {
|
||||
let path = Path::new(file_path).to_owned();
|
||||
|
||||
let parser_config = ParserConfig::builder()
|
||||
.extract_images(true)
|
||||
.compress_images(true)
|
||||
.quality(75)
|
||||
.image_handling_mode(ImageHandlingMode::Manually)
|
||||
.build();
|
||||
|
||||
let mut streamer = tokio::task::spawn_blocking(move || {
|
||||
PptxContainer::open(&path, parser_config).map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)
|
||||
}).await??;
|
||||
|
||||
let (tx, rx) = mpsc::channel(32);
|
||||
|
||||
tokio::spawn(async move {
|
||||
for slide_result in streamer.iter_slides() {
|
||||
match slide_result {
|
||||
Ok(slide) => {
|
||||
if let Some(md_content) = slide.convert_to_md() {
|
||||
let chunk = Chunk {
|
||||
content: md_content,
|
||||
metadata: Metadata::Presentation {
|
||||
slide_number: slide.slide_number,
|
||||
image: None,
|
||||
},
|
||||
};
|
||||
|
||||
if tx.send(Ok(chunk)).await.is_err() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(images) = slide.load_images_manually() {
|
||||
for image in images.iter() {
|
||||
let base64_data = &image.base64_content;
|
||||
|
||||
let total_length = base64_data.len();
|
||||
let mut offset = 0;
|
||||
let mut segment_index = 0;
|
||||
|
||||
while offset < total_length {
|
||||
let end = min(offset + IMAGE_SEGMENT_SIZE_IN_CHARS, total_length);
|
||||
let segment_content = &base64_data[offset..end];
|
||||
let is_end = end == total_length;
|
||||
|
||||
let base64_image = Base64Image::new(
|
||||
image.img_ref.id.clone(),
|
||||
segment_content.to_string(),
|
||||
segment_index,
|
||||
is_end
|
||||
);
|
||||
|
||||
let chunk = Chunk {
|
||||
content: String::new(),
|
||||
metadata: Metadata::Presentation {
|
||||
slide_number: slide.slide_number,
|
||||
image: Some(base64_image),
|
||||
},
|
||||
};
|
||||
|
||||
if tx.send(Ok(chunk)).await.is_err() {
|
||||
break;
|
||||
}
|
||||
|
||||
offset = end;
|
||||
segment_index += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
let _ = tx.send(Err(Box::new(e) as Box<dyn std::error::Error + Send + Sync>)).await;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(Box::pin(ReceiverStream::new(rx)))
|
||||
}
|
Loading…
Reference in New Issue
Block a user