From 31a7abea14379c343db41f525bf98b0e4d9ea51e Mon Sep 17 00:00:00 2001 From: Zixiao Wang Date: Thu, 23 Jan 2025 21:50:55 +0800 Subject: [PATCH] updated ui --- .gitignore | 2 + Dockerfile | 31 +++ README.md | 2 +- __pycache__/main.cpython-312.pyc | Bin 0 -> 769 bytes __pycache__/pipeline_setup.cpython-312.pyc | Bin 0 -> 583 bytes endpoints/image.py | 53 ++++ endpoints/text.py | 24 ++ endpoints/video.py | 229 ++++++++++++++++++ main.py | 18 ++ pipeline_setup.py | 19 ++ requirements.txt | 117 +++++++++ ui.py | 63 +++++ .../audio_transcription.cpython-312.pyc | Bin 0 -> 895 bytes .../image_processing.cpython-312.pyc | Bin 0 -> 763 bytes .../video_processing.cpython-312.pyc | Bin 0 -> 5203 bytes utils/audio_transcription.py | 13 + utils/image_processing.py | 12 + utils/video_processing.py | 93 +++++++ 18 files changed, 675 insertions(+), 1 deletion(-) create mode 100644 Dockerfile create mode 100644 __pycache__/main.cpython-312.pyc create mode 100644 __pycache__/pipeline_setup.cpython-312.pyc create mode 100644 endpoints/image.py create mode 100644 endpoints/text.py create mode 100644 endpoints/video.py create mode 100644 main.py create mode 100644 pipeline_setup.py create mode 100644 requirements.txt create mode 100644 ui.py create mode 100644 utils/__pycache__/audio_transcription.cpython-312.pyc create mode 100644 utils/__pycache__/image_processing.cpython-312.pyc create mode 100644 utils/__pycache__/video_processing.cpython-312.pyc create mode 100644 utils/audio_transcription.py create mode 100644 utils/image_processing.py create mode 100644 utils/video_processing.py diff --git a/.gitignore b/.gitignore index 60d7971..f479596 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ *.app .snapshots/* + __pycache__/* +endpoints/__pycache__/* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9a038e2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,31 @@ +FROM python:3.12-slim + +WORKDIR /home/ooin/st/app + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + libssl-dev \ + libffi-dev \ + libxml2-dev \ + libxslt1-dev \ + zlib1g-dev \ + libjpeg-dev \ + libopenblas-dev \ + libopenmpi-dev \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . + +RUN pip install --upgrade pip + +RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 flash-attn==2.7.2.post1 -f https://download.pytorch.org/whl/torch_stable.html + +RUN pip install --no-cache-dir -r requirements.txt \ + --index-url https://pypi.tuna.tsinghua.edu.cn/simple/ --timeout 100 --retries 5 + +COPY . . 
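+
+# Note: utils/video_processing.py shells out to the ffmpeg CLI and pydub decodes mp4
+# audio with it, while opencv-python expects libGL at import time; python:3.12-slim
+# provides none of these. If the deployment image does not already include them, an
+# extra layer along these lines is needed:
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends ffmpeg libgl1 libglib2.0-0 && \
+    rm -rf /var/lib/apt/lists/*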
+ +EXPOSE 80 + +CMD ["uvicorn", "pipeline_UI3+audio:app", "--host", "0.0.0.0", "--port", "80"] \ No newline at end of file diff --git a/README.md b/README.md index cc8efb3..a1df0ab 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ -# tiktok_AI +# api diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2dbc978a7f30dd86fa455f7b14b6391ecb31217 GIT binary patch literal 769 zcmaJLU6{r?B>r&_H<*jhs9mF zR<^dX7M7NRrPz3f93u;gmEFR1R?cKMyCR2^FnRCGoA14k&26( zBhg(hiYru}JzcF;8ahAsAGJPL&k+J@5~F(^fHlPw_0V!HkF-emk&a<|3WO5MRSQJZdrv6hcQ!x1ha4f8)hj0?9Vn3pu$MxQB@F-z1 z$E+8>ffFHw-|Nm!5>Y1+ZcdgvdC<+1uD7x2$O+nMpBuNLg!Dpso1GVW`Ps9lVw+%u z&={IyxH^Vw;{Ku8FMfgc5M6&IPVF>EZA$r2B@n8}p;*d+Q&k968by>;NL7)HkY#(f$tw1)vtA3F zQ}_v-5d4Fx{RjL6PTbtRaDYp0mD&?0)(w@I!|a>4dNXh4d%eB{WIX!)Y4}Y6;8$7P zg#8a(4kY*tAb?5)V#QTr)m3B7)f50#SBIKwr~oOb%qrBO0V}WyO<04*+0tos(dEoK zRTs#uBY%mgPW!UT!tVa5U)V*LG%d&oLznW?FySf7R)A zy~Bf--NC!*y@Ldi-qF4vIDNtqBS-t~t@hIwTb(zrh33DX%yl^mKW_stmn(wd*Dqz9 zJi}lnEmVaaQ-~t(n6lXCLgk6DQk150pcf$`mSg{e7x+9Jc`2SspP?U)kXN#ZaF%-^ z%?KAO3p$8sIQCLBxz^f~%qQGSXoSPOrHJLg566f=sVSEm$mwPjLzF}`XPPv}O0Ue2 zNm&1FDU556LNBj%0xso0_|tMmG)4}km^dkSI39C4_Bws%=A_aF&ypueF3fGI&my$L z)}^w%Lb@x{g{COVPq6m`>|InqV`FB?r?xpW&Z~_=ubr%Pv;$6t~xljfchk W2Ss!3s$s5Ht_;gmKk95v_W273Nw59@ literal 0 HcmV?d00001 diff --git a/endpoints/image.py b/endpoints/image.py new file mode 100644 index 0000000..e46c558 --- /dev/null +++ b/endpoints/image.py @@ -0,0 +1,53 @@ +from fastapi import UploadFile, Form +from fastapi.responses import JSONResponse +import io +from PIL import Image +from pipeline_setup import pipe, IMAGE_TOKEN +from utils.image_processing import encode_image_base64 + +async def image_query(file: UploadFile, question: str = Form(...)): + """ + API endpoint to process an image with the user's query. + """ + try: + if file.content_type not in ["image/jpeg", "image/png"]: + return JSONResponse({"query": question, "error": "Unsupported file type."}) + + image_data = await file.read() + image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512)) + encoded_image_base64 = encode_image_base64(image) + + question_with_image_token = f"{question}\n{IMAGE_TOKEN}" + response = await asyncio.to_thread(pipe, (question, image)) + return JSONResponse({"query": question, "response": response.text}) + except Exception as e: + return JSONResponse({"query": question, "error": str(e)}) + + +# import mimetypes +# async def image_query(file: UploadFile, question: str = Form(...)): +# """ +# API endpoint to process an image with the user's query. 
+# """ +# try: +# # Get the file path from the UploadFile object +# file_path = file.filename + +# # Determine the file type using the file extension +# file_type, _ = mimetypes.guess_type(file_path) +# if file_type not in ["image/jpeg", "image/png"]: +# return {"query": question, "error": "Unsupported file type."} + +# # Read the image file +# image_data = await file.read() +# image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512)) +# encoded_image_base64 = encode_image_base64(image) + +# # Prepare the query with the image token +# question_with_image_token = f"{question}\n{IMAGE_TOKEN}" + +# # Query the model +# response = await asyncio.to_thread(pipe, (question, image)) +# return {"query": question, "response": response.text} +# except Exception as e: +# return {"query": question, "error": str(e)} diff --git a/endpoints/text.py b/endpoints/text.py new file mode 100644 index 0000000..f1afc03 --- /dev/null +++ b/endpoints/text.py @@ -0,0 +1,24 @@ +from fastapi import Form +from fastapi.responses import JSONResponse +from asyncio import to_thread +from pipeline_setup import pipe + +async def text_query(question: str = Form(...)): + """ + API endpoint to process text input with the user's query. + """ + try: + response = await to_thread(pipe, question) + return JSONResponse({"query": question, "response": response.text}) + except Exception as e: + return JSONResponse({"query": question, "error": str(e)}) + +# async def text_query(question: str = Form(...)): +# """ +# API endpoint to process text input with the user's query. +# """ +# try: +# response = await to_thread(pipe, question) +# return {"query": question, "response": response.text} +# except Exception as e: +# return {"query": question, "error": str(e)} diff --git a/endpoints/video.py b/endpoints/video.py new file mode 100644 index 0000000..62fbc4a --- /dev/null +++ b/endpoints/video.py @@ -0,0 +1,229 @@ +from fastapi import UploadFile, Form +from fastapi.responses import JSONResponse +from pipeline_setup import pipe +from utils.image_processing import encode_image_base64 +from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video +from utils.audio_transcription import transcribe_audio +import asyncio +import time +from concurrent.futures import ThreadPoolExecutor + +async def video_query(file: UploadFile, question: str = Form(...)): + """ + API endpoint to process a video file with the user's query. 
+ """ + try: + print("Processing video...") + + # Validate file type + if file.content_type not in ["video/mp4", "video/avi", "video/mkv"]: + return JSONResponse({"query": question, "error": "Unsupported video file type."}) + + # Start overall timer + overall_start_time = time.time() + + # Save the uploaded video to a temporary file + print("Reading video...") + video_data = await file.read() + temp_video_path = "/tmp/temp_video.mp4" + with open(temp_video_path, "wb") as temp_video_file: + temp_video_file.write(video_data) + print(f"Temp video saved to: {temp_video_path}") + + # Record the time after reading the video + video_reading_time = time.time() + + # Split the video into segments + print("Splitting video...") + segments = split_video_into_segments(temp_video_path, segment_duration=30) + print(f"Video split into {len(segments)} segments.") + + aggregated_responses = [] + segment_timings = [] + + for i, segment_path in enumerate(segments): + print(f"Processing segment {i+1}/{len(segments)}: {segment_path}") + + # Start timing for the segment + segment_start_time = time.time() + + # Extract key frames + frame_start_time = time.time() + imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2) + frame_time = time.time() + + # Extract audio and transcribe + audio_start_time = time.time() + audio_path = extract_audio_from_video(segment_path) + transcribed_text = transcribe_audio(audio_path) + audio_time = time.time() + + # Combine transcribed text with the query + combined_query = f"Audio Transcript: {transcribed_text}\n{question}" + + # Prepare content for the pipeline + question_with_frames = "" + for j, img in enumerate(imgs): + question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n" + question_with_frames += combined_query + + content = [{"type": "text", "text": question_with_frames}] + for img in imgs: + content.append({ + "type": "image_url", + "image_url": { + "max_dynamic_patch": 1, + "url": f"data:image/jpeg;base64,{encode_image_base64(img)}" + } + }) + + # Query the model + inference_start_time = time.time() + messages = [dict(role="user", content=content)] + response = await asyncio.to_thread(pipe, messages) + inference_time = time.time() + + # Aggregate response + aggregated_responses.append(response.text) + + # Calculate timing for the segment + segment_timings.append({ + "segment_index": i + 1, + "segment_processing_time": inference_time - segment_start_time, + "frame_extraction_time": frame_time - frame_start_time, + "audio_extraction_time": audio_time - audio_start_time, + "model_inference_time": inference_time - inference_start_time + }) + + print(f"transcription: {transcribed_text}") + # print(f"content: {content}") + + overall_end_time = time.time() + + # Aggregate total timings + total_timings = { + "video_reading_time": video_reading_time - overall_start_time, + "total_segments": len(segments), + "total_processing_time": overall_end_time - overall_start_time, + "segment_details": segment_timings + } + + return JSONResponse({ + "question": question, + "responses": aggregated_responses, + "timings": total_timings, + }) + except Exception as e: + return JSONResponse({"query": question, "error": str(e)}) + + +# async def video_query(file: UploadFile, question: str = Form(...)): +# """ +# API endpoint to process a video file with the user's query. 
+# """ +# try: +# print("Processing video...") + +# # Get the file path from the UploadFile object +# file_path = file.filename + +# # Determine the file type using the file extension +# file_type, _ = mimetypes.guess_type(file_path) +# if file_type is None or not file_type.startswith("video/"): +# return {"query": question, "error": "Unsupported video file type."} + +# # Start overall timer +# overall_start_time = time.time() + +# # Save the uploaded video to a temporary file +# print("Reading video...") +# video_data = await file.read() +# temp_video_path = "/tmp/temp_video.mp4" +# with open(temp_video_path, "wb") as temp_video_file: +# temp_video_file.write(video_data) +# print(f"Temp video saved to: {temp_video_path}") + +# # Record the time after reading the video +# video_reading_time = time.time() + +# # Split the video into segments +# print("Splitting video...") +# segments = split_video_into_segments(temp_video_path, segment_duration=30) +# print(f"Video split into {len(segments)} segments.") + +# aggregated_responses = [] +# segment_timings = [] + +# for i, segment_path in enumerate(segments): +# print(f"Processing segment {i+1}/{len(segments)}: {segment_path}") + +# # Start timing for the segment +# segment_start_time = time.time() + +# # Extract key frames +# frame_start_time = time.time() +# imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2) +# frame_time = time.time() + +# # Extract audio and transcribe +# audio_start_time = time.time() +# audio_path = extract_audio_from_video(segment_path) +# transcribed_text = transcribe_audio(audio_path) +# audio_time = time.time() + +# # Combine transcribed text with the query +# combined_query = f"Audio Transcript: {transcribed_text}\n{question}" + +# # Prepare content for the pipeline +# question_with_frames = "" +# for j, img in enumerate(imgs): +# question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n" +# question_with_frames += combined_query + +# content = [{"type": "text", "text": question_with_frames}] +# for img in imgs: +# content.append({ +# "type": "image_url", +# "image_url": { +# "max_dynamic_patch": 1, +# "url": f"data:image/jpeg;base64,{encode_image_base64(img)}" +# } +# }) + +# # Query the model +# inference_start_time = time.time() +# messages = [dict(role="user", content=content)] +# response = await asyncio.to_thread(pipe, messages) +# inference_time = time.time() + +# # Aggregate response +# aggregated_responses.append(response.text) + +# # Calculate timing for the segment +# segment_timings.append({ +# "segment_index": i + 1, +# "segment_processing_time": inference_time - segment_start_time, +# "frame_extraction_time": frame_time - frame_start_time, +# "audio_extraction_time": audio_time - audio_start_time, +# "model_inference_time": inference_time - inference_start_time +# }) + +# print(f"transcription: {transcribed_text}") + +# overall_end_time = time.time() + +# # Aggregate total timings +# total_timings = { +# "video_reading_time": video_reading_time - overall_start_time, +# "total_segments": len(segments), +# "total_processing_time": overall_end_time - overall_start_time, +# "segment_details": segment_timings +# } + +# return { +# "question": question, +# "responses": aggregated_responses, +# "timings": total_timings, +# } +# except Exception as e: +# return {"query": question, "error": str(e)} \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..97d8b51 --- /dev/null +++ b/main.py @@ -0,0 +1,18 @@ +from fastapi import FastAPI +from 
endpoints.text import text_query +from endpoints.image import image_query +from endpoints.video import video_query + +app = FastAPI() + +# Register routes +app.post("/api/text")(text_query) +app.post("/api/image")(image_query) +app.post("/api/video")(video_query) + +if __name__ == "__main__": + import uvicorn + uvicorn.run("main:app", host="0.0.0.0", port=8080, reload=True) + +# python main.py +# uvicorn main:app --reload \ No newline at end of file diff --git a/pipeline_setup.py b/pipeline_setup.py new file mode 100644 index 0000000..e27eb31 --- /dev/null +++ b/pipeline_setup.py @@ -0,0 +1,19 @@ +from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig + +# Constants +IMAGE_TOKEN = "[IMAGE_TOKEN]" + +# Model initialization +model = "OpenGVLab/InternVL2-26B-AWQ" +pipe = pipeline( + model, + backend_config=TurbomindEngineConfig( + model_format="awq", + tp=4, + session_len=12864, + max_batch_size=1, + cache_max_entry_count=0.05, + cache_block_seq_len=32768, + quant_policy=4 + ) +) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8806a97 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,117 @@ +accelerate==1.2.1 +addict==2.4.0 +aiohappyeyeballs==2.4.4 +aiohttp==3.11.11 +aiosignal==1.3.2 +annotated-types==0.7.0 +anyio==4.7.0 +attrs==24.3.0 +bitsandbytes==0.45.0 +certifi==2024.12.14 +charset-normalizer==3.4.0 +click==8.1.8 +cloudpickle==3.1.0 +datasets==3.2.0 +decord==0.6.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +einops==0.8.0 +fastapi==0.115.6 +filelock==3.16.1 +fire==0.7.0 +# flash-attn==2.7.2.post1 +frozenlist==1.5.0 +fsspec==2024.9.0 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.28.1 +huggingface-hub==0.27.0 +idna==3.10 +interegular==0.3.3 +Jinja2==3.1.5 +jiter==0.8.2 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +lark==1.2.2 +llvmlite==0.43.0 +lmdeploy==0.6.4 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +mdurl==0.1.2 +mmengine-lite==0.10.5 +modelscope==1.21.0 +mpmath==1.3.0 +multidict==6.1.0 +multiprocess==0.70.16 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.3 +numba==0.60.0 +numpy==1.26.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==12.560.30 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.1.105 +openai==1.58.1 +opencv-python==4.10.0.84 +outlines==0.0.46 +packaging==24.2 +pandas==2.2.3 +peft==0.11.1 +pillow==11.0.0 +platformdirs==4.3.6 +propcache==0.2.1 +protobuf==5.29.2 +psutil==6.1.1 +pyairports==2.1.1 +pyarrow==18.1.0 +pycountry==24.6.1 +pydantic==2.10.4 +pydantic_core==2.27.2 +Pygments==2.18.0 +pynvml==12.0.0 +python-dateutil==2.9.0.post0 +python-multipart==0.0.20 +pytz==2024.2 +PyYAML==6.0.2 +referencing==0.35.1 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rpds-py==0.22.3 +safetensors==0.4.5 +sentencepiece==0.2.0 +setuptools==75.6.0 +shortuuid==1.0.13 +six==1.17.0 +sniffio==1.3.1 +starlette==0.41.3 +sympy==1.13.3 +termcolor==2.5.0 +tiktoken==0.8.0 +timm==1.0.12 +tokenizers==0.21.0 +# torch==2.4.0 +# torchaudio==2.4.0 +# torchvision==0.19.0 +tqdm==4.67.1 +transformers==4.47.1 +triton==3.0.0 +typing_extensions==4.12.2 +tzdata==2024.2 +urllib3==2.3.0 +uvicorn==0.34.0 +wheel==0.45.1 +xxhash==3.5.0 +yapf==0.43.0 +yarl==1.18.3 diff --git a/ui.py b/ui.py new file mode 100644 index 0000000..3263e18 --- /dev/null +++ b/ui.py @@ -0,0 
+1,63 @@ +import gradio as gr +import asyncio +from endpoints.text import text_query +from endpoints.image import image_query +from endpoints.video import video_query + +def setup_ui(): + with gr.Blocks() as ui: + gr.Markdown( + """ + # Multimodal Query Interface + Submit text, image, or video queries and get insights powered by APIs. + """ + ) + + # Tabbed layout + with gr.Tabs(): + # Text Query Tab + with gr.Tab("Text Query"): + gr.Markdown("### Submit a Text Query") + with gr.Row(): + text_input = gr.Textbox(label="Your Question", placeholder="Type your question here...") + text_button = gr.Button("Submit") + text_output = gr.Textbox(label="Response", interactive=False) + text_button.click( + fn=lambda q: asyncio.run(text_query(q)), + inputs=[text_input], + outputs=[text_output] + ) + + # Image Query Tab + with gr.Tab("Image Query"): + gr.Markdown("### Submit an Image Query") + with gr.Row(): + image_input = gr.File(label="Upload Image") + image_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...") + image_button = gr.Button("Submit") + image_output = gr.Textbox(label="Response", interactive=False) + image_button.click( + fn=lambda img, q: asyncio.run(image_query(img, q)), + inputs=[image_input, image_question_input], + outputs=[image_output] + ) + + # Video Query Tab + with gr.Tab("Video Query"): + gr.Markdown("### Submit a Video Query") + with gr.Row(): + video_input = gr.File(label="Upload Video") + video_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...") + video_button = gr.Button("Submit") + video_output = gr.Textbox(label="Response", interactive=False) + video_button.click( + fn=lambda vid, q: asyncio.run(video_query(vid, q)), + inputs=[video_input, video_question_input], + outputs=[video_output] + ) + + return ui + +if __name__ == "__main__": + ui = setup_ui() + ui.launch(server_name="0.0.0.0", server_port=7860) \ No newline at end of file diff --git a/utils/__pycache__/audio_transcription.cpython-312.pyc b/utils/__pycache__/audio_transcription.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0165171116a04c7acb71f3beb6b189631736cc8c GIT binary patch literal 895 zcmZ8f&r2IY6rS0gL=$&gZP2Pz2_A$kG)ti;D}qo9E#k#nE@9(LbmMMznVm#aBBW3# zy-6?jR6JVjq5nrOiUq^ao_gvnke+(#o84r!eeAyXX5Nqe-nZ|oY332g)0OwTA2C8d z)nO!L=nPiDIYAUr>>(eQF;>3jX}(_8!Pgm~gpnGdnKz`Iq1p622Ip*h+{*n@tt>2! 
z^(J+LZMN&PhJbV43mjVU1Ij#^Z@H8OmCzA;GRv80^2QE8N=3oZ;T@Dgogk6M>ROtU zIvsRAj2~f&kF*Z1XVPHmbTq1c*3XCa70E)4xf&1rO3n3{%(BBU;3}d%B{OlU%*Q4kD^eB3V)4%tdoS>r9RzN}j)X1T zeG%+e)?e67;d+ssR0!@gq8;vr!VMawuq|iVA>8eV1XCscaLgIM3w;$m(IZe@^xc@f zHkK}prPGxwqtxB#O)Ym{_DJsS=IhPl`O}T=<~Q=NH!*!drUy`fxC!bXt`*Rj>u2EH z=9=OgUV&4U^Pk*eUQ_(eK^$1swaS!F<*f*~`>Hoyi%O;3y{ z<9AeS@+LhYf!~K~wZYLcC^h-aaqG%hgqh8I*XF{dx$vQWWm?_my@{#g<#%frWR6cl z>XtPoGhv%HtI2O1JJ@rhknseT9tj?k8u$N%;*tcy_$uIQdTfn9fHrE!qQ{{6I>z`1 hT7;)h2%b4B^bwdZg@H1CL&x){a0TY9@LQR<o}AfaHxK50->IEvN+R)VgO6ida zd8+8!4a`$+1RB9CB(0iwL1MDHD%RwyaOZ#}nx`aZ64IZsGuQ=QD%nm*7dr5Z(x2Ug zfTd%}%fxJsUFhlaC}weu{_G`QE>`9kpLqgIwX?op2-|Pf0tXwmaaeq4M87szAi{!8 z@z$n68F5>UJJC^&N{9OoM0d1$Kzy5-X|A{dPZ!z@)8b=}sC|ODR*i1*Y;|iBN6C4n zf!T@O429@mS>tlm!+ISP>`?i2!}zyhJq>(p1%cbLD6^P*#Db&By`mK|w@IyN z9F=1d)G(#;UB_*{JS|e0YSZFmNu-l~rUl`8;3@;IFTL%&?!4*t4t}ES%hL1Gt7jjz zUg-zA{YzgQ>8k^M_2OWZUmN7thIwnKuaETYfxbP|cfO;YiO2#>jbsIoYr=6=w2R33 zb5*n`s`nX?WvMU9-OQdJIAIgtB{xJUM?!B2H&!8pmmqrya=*dK7-?|#BK&ar{&Wlk L{Y0-ML}~s58lR-K literal 0 HcmV?d00001 diff --git a/utils/__pycache__/video_processing.cpython-312.pyc b/utils/__pycache__/video_processing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99900a488ff0a472612bc04fc29c0d048746348c GIT binary patch literal 5203 zcmb7IU2Gf25#Hk+|Bfe867^%tu}oWzCHhB-t;BIGSCM#G8&}l zLuc=}B;zojH{G4OC*INDC^Yxq(`il#6kG!)J=S@kdC7_;=t9VSyjkm4P~-S z#G1=wnSisl>#p~Z==h*~;#s%o6kR$Pinf&u)Lmp%k*#W_%B>!e)7=p^%WB)moY*sj zs+2klx~-|`R(HmBRC)=ey9r+kT}?7bcb-K-Z)TJYKsuX>M>RFT2d1$t2BJw#4ycxe zssT9+9coGvqT#3{2E??&Yf(8FNUPCgBrrIbNJ)`SlTF6or6p2bCX*1Ee-7-~C0&ZD zT1cK!umu{fQeP;;bS6^I8;)=oGs)NvL^JdORRuzWzhE$eoSMa#N0^j^sbH;PlU2i& z;HM-ps;F>VrVkmMd4Kx(l$KT`g9h0RIwENX*WWi58XG?|78)EoYcy19<9(+FLj7l6 zJAKY@DpFkHRmrd`d@>>#Op4bg4JIZ>lLoD7iovSsiz!7GBo$Uu(n-U?r&3Z<40;UK z+NL?9mTZU|j={=lEtS?nASZ~)rwlrrQVmX%H9j5+D||vy4X;%z&mqU6!m=8#Xdx|{ zkPMFS4oBmX;S}UVf=`MnzKU3NrS$KEU6XP`>XJe9E>-K&qEnhY73v%AN^8-$TGB~r zE1rB}T+%cl6|AH)sHe9H_JvXw>INm<_*2#Hx`P_YG?_&q@ zcvsn$0^73GQ)Kt#`q$Z-IpJF5O5|N(wP{bGX;0DHUSvCysxhAc(JhK z#bWc3qW>qk(Y3~%%l(DM_T1?;#))}fKJ}HaE_Y(RY1>lY`|9$`#T|$8&maG?X<&|B zZ{CVoEb0sT^2qIuJ3V*y-xc%Eoh&ww&T(IH4fEpC$QoDoZuflV*S%|8?YnJxkf6RD z$oV}>`#0QnKRf4q%pte`n)ix#er8#`@7lNKs=emB;#=bGyPn1+(N+~iJsziGR?(C0@ zuvJEcZ3KZ9Y*&vlfSli3x4y!cP`XPr=WAuZ@ic@bT(j<~bE?MfGK!LsuzR+&d(c>x z)!n*FXR)MV8_4DX**sYmyaA_`(VLv!3n*Jtg~c_oGV&6s;Vc;4L)Ic1$$CWx{*iVH z{mKh+_^#Cow5bQs@z3ELv%YMt?gJ^@AO(kO+IGS@)=H>KsdMEO>Z(QtDkBJ0HL_l> z(`|Zv25c<@-B~~EwLxnmBeiDNsEmb=6m8 zrwwF|PEElMp|@3QCo9A{2vsSotnq19;jFR;3`0+(Cne1`>J702PYi;p-bmyEe-{CN z*Kii@0|o!AbB0jt3~ECMLFO6gxqvA6U43|ozJO*Zb1a>$&H83g(3u%vG0+ZN(iC0* zPdO#c1}qeZLQVvT=e!8cL=1p)Ovt8>1XPuhYE+xG9HdYN3y=_-Jq|}UzkqBJ>6U@~ z520LDNKy??zy&9AKoGi>S9@xZN9nXki8hf26QD&qn6LAu8AXH1R9&!Gsmw)go!u|qr)}`DcqU!=279}>gVG^I?ij7O3ve=q|HcDGN8@K0YOfxIJS*WOXMuA zkU~yrQGt&K!f|=VIycPhbi`WwtKLT#W*_-*hT$~QZKlm$Gq`w!BaEO!!GLDCglVl` zj?0SS?LRYmW<2!D(DkRTrQo6_sQ-&=G&RR_;f)0ZkR!Mo+h4USaJP6kVDI4u3EIJm~b2LehE3sj4jHIF1@jmUIM zmQ9uFA+W4ce9(C;F2iv3<<2K3;cKBrAu~Pm?#_ZMFz23sX}!6{bdIM!?OyF3D0B}L zn+NB(bzl9eZ(qT;Z)Nzle)rrL-TARs^Jibn|MX}1iP!UQ{33r*DEh=XX1&h8*tF2J z#4J7a!6pTxs=+4sOC;>7Rilvv zA4;U-S`-2xcy)O#J>$!&O!IN&B|OD$#zh8aU2|(shTW_|z-mTt$`L$}iW(k^0X7uX zsIfE~0#0d&@*wuLUT}uD)B*d4wN>SqIXh%{O6}%O*-}N-V8a?b1Jy}6E*fmbE+EED zMARmjY#P=NGRxAo{-|9kA(*-c{_5wT#2os@<+AU7+=ObjEU~xfmA;#fdt6s;V9mc} zv3H^O!xvWSZuQ@`-yFH)Ew&96{U=uaV+H?M(LbIW{ig#>P+_WA6E0kk@?)=E6|Tju z#FiSD8$a51W82EBH(GBWEbM&#w)Fd{Pp0nf{3Q7WRX9AH4~*RRp3Gm6a)bGrFocnH z4U60Yw{(8F@A`#X^lC>>p`)i*do(w^-hJdw-5>k!I{t9-UiXP&{cxfC#0oz@yfnSc 
zU!VP}KbY@6kvo~MAKq}F#^&6qb@#S=?(NH4*0*k7n*2@tdOW}H=$+S#t;5BwBXizw
zX~eev{lL+C4aYuy^M-rbu|mCbe&x;FsY1gskiy@%*uK!dbf8$@kvmClLrt-6Pi|zb
zc3Zx+^L9(2^=JXh1&y+TGj6M6%`|q#{r2cvqw_7x
z1G&+A%wF8f?Oo|v*`H@$yvH0_^E84rnjRrYJ;J6Jq#xmLBc?l0dT~RY3H1toZ74W|
zHtd!IFd>~D8y>a5+it4NIs%6%39=$q`I35w5K6waGn~fHd)4$2=Ihi%F>{X}Bg;FP
z4%T%nA&cp_^s@3xSQZ

diff --git a/utils/audio_transcription.py b/utils/audio_transcription.py
new file mode 100644
--- /dev/null
+++ b/utils/audio_transcription.py
@@ -0,0 +1,13 @@
+from pydub import AudioSegment
+from whisper import load_model
+
+def extract_audio_from_video(video_path: str) -> str:
+    audio = AudioSegment.from_file(video_path)
+    audio_path = "/tmp/temp_audio.wav"
+    audio.export(audio_path, format="wav")
+    return audio_path
+
+def transcribe_audio(audio_path: str) -> str:
+    model = load_model("base")
+    result = model.transcribe(audio_path)
+    return result["text"]
diff --git a/utils/image_processing.py b/utils/image_processing.py
new file mode 100644
index 0000000..3ce45e9
--- /dev/null
+++ b/utils/image_processing.py
@@ -0,0 +1,12 @@
+import io
+import base64
+from PIL import Image
+
+def encode_image_base64(image: Image.Image) -> str:
+    """
+    Encode a PIL Image to a Base64 string.
+    """
+    buffered = io.BytesIO()
+    image.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode()
+
diff --git a/utils/video_processing.py b/utils/video_processing.py
new file mode 100644
index 0000000..28c18fb
--- /dev/null
+++ b/utils/video_processing.py
@@ -0,0 +1,93 @@
+import cv2
+import os
+import subprocess
+import numpy as np
+from PIL import Image
+from pydub import AudioSegment
+from decord import VideoReader, cpu
+from concurrent.futures import ThreadPoolExecutor
+
+def split_video_into_segments(video_path, segment_duration=30):
+    """
+    Splits a video into segments of a specified duration using FFmpeg.
+    """
+    output_dir = "/tmp/video_segments"
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Calculate total duration of the video
+    cap = cv2.VideoCapture(video_path)
+    fps = int(cap.get(cv2.CAP_PROP_FPS))
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    total_duration = total_frames / fps
+    cap.release()
+
+    segments = []
+    for start_time in range(0, int(total_duration), segment_duration):
+        segment_file = os.path.join(output_dir, f"segment_{start_time}.mp4")
+        command = [
+            "ffmpeg", "-i", video_path,
+            "-ss", str(start_time),
+            "-t", str(segment_duration),
+            "-c", "copy", segment_file
+        ]
+        subprocess.run(command, check=True)
+        segments.append(segment_file)
+
+    return segments
+
+def extract_motion_key_frames(video_path, max_frames=20, sigma_multiplier=2, frame_interval=1):
+    """
+    Extracts key frames from a video based on motion intensity.
+    """
+    def calculate_motion(frame_pair):
+        """
+        Calculates motion between two consecutive frames using optical flow.
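+        The first element of frame_pair is an already-grayscale reference frame; the second
+        is converted to grayscale here, dense Farneback optical flow is computed between the
+        two, and the motion score is the sum of the squared flow vectors.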
+ """ + prev_gray, current_frame = frame_pair + current_gray = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY) + flow = cv2.calcOpticalFlowFarneback(prev_gray, current_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0) + motion = np.sum(flow ** 2) + return motion, current_gray + + # Load video frames using Decord + video = VideoReader(video_path, ctx=cpu(0)) + frames_batch = video.get_batch(range(0, len(video), frame_interval)).asnumpy() + + # Resize frames for faster processing + frames = [cv2.resize(frame, (frame.shape[1] // 2, frame.shape[0] // 2)) for frame in frames_batch] + + # Initialize the first frame + prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY) + frame_pairs = [(prev_gray, frames[i]) for i in range(1, len(frames))] + + # Calculate motion statistics + motion_values = [] + with ThreadPoolExecutor() as executor: + motion_results = list(executor.map(calculate_motion, frame_pairs)) + motion_values = [motion for motion, _ in motion_results] + + # Calculate threshold statistically + motion_mean = np.mean(motion_values) + motion_std = np.std(motion_values) + threshold = motion_mean + sigma_multiplier * motion_std + + # Extract key frames based on motion threshold + key_frames = [] + for i, (motion, frame) in enumerate(zip(motion_values, frames[1:])): + if motion > threshold and len(key_frames) < max_frames: + img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + key_frames.append(img) + + return key_frames + +def extract_audio_from_video(video_path): + """ + Extract audio from video using pydub and save as a temporary audio file. + """ + print("Audio extraction started...") + audio = AudioSegment.from_file(video_path) + print("Audio extraction completed.") + audio_path = "/tmp/temp_audio.wav" + audio.export(audio_path, format="wav") + print(f"Audio extracted and saved to: {audio_path}") + return audio_path