updated ui

parent 0eb6bdc9c4
commit 31a7abea14

.gitignore (vendored) | 2
@@ -11,3 +11,5 @@
 *.app
 .snapshots/*

+__pycache__/*
+endpoints/__pycache__/*

Dockerfile | 31 (new file)
@@ -0,0 +1,31 @@
FROM python:3.12-slim

WORKDIR /home/ooin/st/app

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    libssl-dev \
    libffi-dev \
    libxml2-dev \
    libxslt1-dev \
    zlib1g-dev \
    libjpeg-dev \
    libopenblas-dev \
    libopenmpi-dev \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .

RUN pip install --upgrade pip

RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 flash-attn==2.7.2.post1 -f https://download.pytorch.org/whl/torch_stable.html

RUN pip install --no-cache-dir -r requirements.txt \
    --index-url https://pypi.tuna.tsinghua.edu.cn/simple/ --timeout 100 --retries 5

COPY . .

EXPOSE 80

CMD ["uvicorn", "pipeline_UI3+audio:app", "--host", "0.0.0.0", "--port", "80"]

__pycache__/main.cpython-312.pyc | BIN (new file, binary, not shown)
__pycache__/pipeline_setup.cpython-312.pyc | BIN (new file, binary, not shown)

endpoints/image.py | 53 (new file)
@@ -0,0 +1,53 @@
from fastapi import UploadFile, Form
from fastapi.responses import JSONResponse
import asyncio
import io
from PIL import Image
from pipeline_setup import pipe, IMAGE_TOKEN
from utils.image_processing import encode_image_base64

async def image_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process an image with the user's query.
    """
    try:
        if file.content_type not in ["image/jpeg", "image/png"]:
            return JSONResponse({"query": question, "error": "Unsupported file type."})

        image_data = await file.read()
        image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
        encoded_image_base64 = encode_image_base64(image)

        question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
        response = await asyncio.to_thread(pipe, (question, image))
        return JSONResponse({"query": question, "response": response.text})
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})


# import mimetypes
# async def image_query(file: UploadFile, question: str = Form(...)):
#     """
#     API endpoint to process an image with the user's query.
#     """
#     try:
#         # Get the file path from the UploadFile object
#         file_path = file.filename
#
#         # Determine the file type using the file extension
#         file_type, _ = mimetypes.guess_type(file_path)
#         if file_type not in ["image/jpeg", "image/png"]:
#             return {"query": question, "error": "Unsupported file type."}
#
#         # Read the image file
#         image_data = await file.read()
#         image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
#         encoded_image_base64 = encode_image_base64(image)
#
#         # Prepare the query with the image token
#         question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
#
#         # Query the model
#         response = await asyncio.to_thread(pipe, (question, image))
#         return {"query": question, "response": response.text}
#     except Exception as e:
#         return {"query": question, "error": str(e)}
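A minimal client sketch (not part of this commit) for exercising this endpoint once the app is running; the host, port, and file name are assumptions, with the /api/image route and port 8080 taken from main.py below.

# sketch: call the image endpoint with an uploaded JPEG and a question
import httpx

with open("example.jpg", "rb") as f:  # hypothetical local file
    resp = httpx.post(
        "http://localhost:8080/api/image",
        files={"file": ("example.jpg", f, "image/jpeg")},
        data={"question": "What is shown in this image?"},
        timeout=120.0,
    )
print(resp.json())  # {"query": ..., "response": ...} or {"query": ..., "error": ...}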

endpoints/text.py | 24 (new file)
@@ -0,0 +1,24 @@
from fastapi import Form
from fastapi.responses import JSONResponse
from asyncio import to_thread
from pipeline_setup import pipe

async def text_query(question: str = Form(...)):
    """
    API endpoint to process text input with the user's query.
    """
    try:
        response = await to_thread(pipe, question)
        return JSONResponse({"query": question, "response": response.text})
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})


# async def text_query(question: str = Form(...)):
#     """
#     API endpoint to process text input with the user's query.
#     """
#     try:
#         response = await to_thread(pipe, question)
#         return {"query": question, "response": response.text}
#     except Exception as e:
#         return {"query": question, "error": str(e)}
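A matching sketch for the text endpoint (not part of this commit); only a form field named question is required, and the same localhost:8080 assumption applies.

# sketch: post a plain-text question as form data
import httpx

resp = httpx.post(
    "http://localhost:8080/api/text",
    data={"question": "Summarize what this service does."},
    timeout=120.0,
)
print(resp.json())  # {"query": ..., "response": ...} on success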

endpoints/video.py | 229 (new file)
@@ -0,0 +1,229 @@
from fastapi import UploadFile, Form
from fastapi.responses import JSONResponse
from pipeline_setup import pipe
from utils.image_processing import encode_image_base64
from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video
from utils.audio_transcription import transcribe_audio
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

async def video_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process a video file with the user's query.
    """
    try:
        print("Processing video...")

        # Validate file type
        if file.content_type not in ["video/mp4", "video/avi", "video/mkv"]:
            return JSONResponse({"query": question, "error": "Unsupported video file type."})

        # Start overall timer
        overall_start_time = time.time()

        # Save the uploaded video to a temporary file
        print("Reading video...")
        video_data = await file.read()
        temp_video_path = "/tmp/temp_video.mp4"
        with open(temp_video_path, "wb") as temp_video_file:
            temp_video_file.write(video_data)
        print(f"Temp video saved to: {temp_video_path}")

        # Record the time after reading the video
        video_reading_time = time.time()

        # Split the video into segments
        print("Splitting video...")
        segments = split_video_into_segments(temp_video_path, segment_duration=30)
        print(f"Video split into {len(segments)} segments.")

        aggregated_responses = []
        segment_timings = []

        for i, segment_path in enumerate(segments):
            print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")

            # Start timing for the segment
            segment_start_time = time.time()

            # Extract key frames
            frame_start_time = time.time()
            imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
            frame_time = time.time()

            # Extract audio and transcribe
            audio_start_time = time.time()
            audio_path = extract_audio_from_video(segment_path)
            transcribed_text = transcribe_audio(audio_path)
            audio_time = time.time()

            # Combine transcribed text with the query
            combined_query = f"Audio Transcript: {transcribed_text}\n{question}"

            # Prepare content for the pipeline
            question_with_frames = ""
            for j, img in enumerate(imgs):
                question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
            question_with_frames += combined_query

            content = [{"type": "text", "text": question_with_frames}]
            for img in imgs:
                content.append({
                    "type": "image_url",
                    "image_url": {
                        "max_dynamic_patch": 1,
                        "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
                    }
                })

            # Query the model
            inference_start_time = time.time()
            messages = [dict(role="user", content=content)]
            response = await asyncio.to_thread(pipe, messages)
            inference_time = time.time()

            # Aggregate response
            aggregated_responses.append(response.text)

            # Calculate timing for the segment
            segment_timings.append({
                "segment_index": i + 1,
                "segment_processing_time": inference_time - segment_start_time,
                "frame_extraction_time": frame_time - frame_start_time,
                "audio_extraction_time": audio_time - audio_start_time,
                "model_inference_time": inference_time - inference_start_time
            })

            print(f"transcription: {transcribed_text}")
            # print(f"content: {content}")

        overall_end_time = time.time()

        # Aggregate total timings
        total_timings = {
            "video_reading_time": video_reading_time - overall_start_time,
            "total_segments": len(segments),
            "total_processing_time": overall_end_time - overall_start_time,
            "segment_details": segment_timings
        }

        return JSONResponse({
            "question": question,
            "responses": aggregated_responses,
            "timings": total_timings,
        })
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})


# async def video_query(file: UploadFile, question: str = Form(...)):
#     """
#     API endpoint to process a video file with the user's query.
#     """
#     try:
#         print("Processing video...")
#
#         # Get the file path from the UploadFile object
#         file_path = file.filename
#
#         # Determine the file type using the file extension
#         file_type, _ = mimetypes.guess_type(file_path)
#         if file_type is None or not file_type.startswith("video/"):
#             return {"query": question, "error": "Unsupported video file type."}
#
#         # Start overall timer
#         overall_start_time = time.time()
#
#         # Save the uploaded video to a temporary file
#         print("Reading video...")
#         video_data = await file.read()
#         temp_video_path = "/tmp/temp_video.mp4"
#         with open(temp_video_path, "wb") as temp_video_file:
#             temp_video_file.write(video_data)
#         print(f"Temp video saved to: {temp_video_path}")
#
#         # Record the time after reading the video
#         video_reading_time = time.time()
#
#         # Split the video into segments
#         print("Splitting video...")
#         segments = split_video_into_segments(temp_video_path, segment_duration=30)
#         print(f"Video split into {len(segments)} segments.")
#
#         aggregated_responses = []
#         segment_timings = []
#
#         for i, segment_path in enumerate(segments):
#             print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")
#
#             # Start timing for the segment
#             segment_start_time = time.time()
#
#             # Extract key frames
#             frame_start_time = time.time()
#             imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
#             frame_time = time.time()
#
#             # Extract audio and transcribe
#             audio_start_time = time.time()
#             audio_path = extract_audio_from_video(segment_path)
#             transcribed_text = transcribe_audio(audio_path)
#             audio_time = time.time()
#
#             # Combine transcribed text with the query
#             combined_query = f"Audio Transcript: {transcribed_text}\n{question}"
#
#             # Prepare content for the pipeline
#             question_with_frames = ""
#             for j, img in enumerate(imgs):
#                 question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
#             question_with_frames += combined_query
#
#             content = [{"type": "text", "text": question_with_frames}]
#             for img in imgs:
#                 content.append({
#                     "type": "image_url",
#                     "image_url": {
#                         "max_dynamic_patch": 1,
#                         "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
#                     }
#                 })
#
#             # Query the model
#             inference_start_time = time.time()
#             messages = [dict(role="user", content=content)]
#             response = await asyncio.to_thread(pipe, messages)
#             inference_time = time.time()
#
#             # Aggregate response
#             aggregated_responses.append(response.text)
#
#             # Calculate timing for the segment
#             segment_timings.append({
#                 "segment_index": i + 1,
#                 "segment_processing_time": inference_time - segment_start_time,
#                 "frame_extraction_time": frame_time - frame_start_time,
#                 "audio_extraction_time": audio_time - audio_start_time,
#                 "model_inference_time": inference_time - inference_start_time
#             })
#
#             print(f"transcription: {transcribed_text}")
#
#         overall_end_time = time.time()
#
#         # Aggregate total timings
#         total_timings = {
#             "video_reading_time": video_reading_time - overall_start_time,
#             "total_segments": len(segments),
#             "total_processing_time": overall_end_time - overall_start_time,
#             "segment_details": segment_timings
#         }
#
#         return {
#             "question": question,
#             "responses": aggregated_responses,
#             "timings": total_timings,
#         }
#     except Exception as e:
#         return {"query": question, "error": str(e)}
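A client sketch for the video endpoint (not part of this commit), again assuming a local deployment on port 8080; video processing is slow, so the timeout is generous, and the per-segment timing breakdown comes back under "timings".

# sketch: upload a clip and read back per-segment answers and timings
import httpx

with open("example.mp4", "rb") as f:  # hypothetical local clip
    resp = httpx.post(
        "http://localhost:8080/api/video",
        files={"file": ("example.mp4", f, "video/mp4")},
        data={"question": "Describe what happens in this clip."},
        timeout=600.0,
    )
payload = resp.json()
print(payload.get("responses"))                            # one answer per 30-second segment
print(payload.get("timings", {}).get("segment_details"))   # timing breakdown per segment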

main.py | 18 (new file)
@@ -0,0 +1,18 @@
from fastapi import FastAPI
from endpoints.text import text_query
from endpoints.image import image_query
from endpoints.video import video_query

app = FastAPI()

# Register routes
app.post("/api/text")(text_query)
app.post("/api/image")(image_query)
app.post("/api/video")(video_query)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8080, reload=True)

# python main.py
# uvicorn main:app --reload

pipeline_setup.py | 19 (new file)
@@ -0,0 +1,19 @@
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig

# Constants
IMAGE_TOKEN = "[IMAGE_TOKEN]"

# Model initialization
model = "OpenGVLab/InternVL2-26B-AWQ"
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(
        model_format="awq",
        tp=4,
        session_len=12864,
        max_batch_size=1,
        cache_max_entry_count=0.05,
        cache_block_seq_len=32768,
        quant_policy=4
    )
)
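For orientation, a sketch (not part of this commit) of how the endpoints consume this pipe object, following lmdeploy's pipeline interface; the prompt and image file are placeholders.

# sketch: direct pipeline calls, mirroring endpoints/text.py and endpoints/image.py
from pipeline_setup import pipe
from PIL import Image

# text-only prompt: the pipeline returns an object exposing .text
print(pipe("Give a one-sentence status check.").text)

# vision-language prompt: a (text, PIL.Image) tuple, as in endpoints/image.py
img = Image.open("example.jpg").convert("RGB")  # hypothetical local file
print(pipe(("Describe this image.", img)).text)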

requirements.txt | 117 (new file)
@@ -0,0 +1,117 @@
accelerate==1.2.1
addict==2.4.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.7.0
attrs==24.3.0
bitsandbytes==0.45.0
certifi==2024.12.14
charset-normalizer==3.4.0
click==8.1.8
cloudpickle==3.1.0
datasets==3.2.0
decord==0.6.0
dill==0.3.8
diskcache==5.6.3
distro==1.9.0
einops==0.8.0
fastapi==0.115.6
filelock==3.16.1
fire==0.7.0
# flash-attn==2.7.2.post1
frozenlist==1.5.0
fsspec==2024.9.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
huggingface-hub==0.27.0
idna==3.10
interegular==0.3.3
Jinja2==3.1.5
jiter==0.8.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
lark==1.2.2
llvmlite==0.43.0
lmdeploy==0.6.4
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
mmengine-lite==0.10.5
modelscope==1.21.0
mpmath==1.3.0
multidict==6.1.0
multiprocess==0.70.16
nest-asyncio==1.6.0
networkx==3.4.2
ninja==1.11.1.3
numba==0.60.0
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-ml-py==12.560.30
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.1.105
openai==1.58.1
opencv-python==4.10.0.84
outlines==0.0.46
packaging==24.2
pandas==2.2.3
peft==0.11.1
pillow==11.0.0
platformdirs==4.3.6
propcache==0.2.1
protobuf==5.29.2
psutil==6.1.1
pyairports==2.1.1
pyarrow==18.1.0
pycountry==24.6.1
pydantic==2.10.4
pydantic_core==2.27.2
Pygments==2.18.0
pynvml==12.0.0
python-dateutil==2.9.0.post0
python-multipart==0.0.20
pytz==2024.2
PyYAML==6.0.2
referencing==0.35.1
regex==2024.11.6
requests==2.32.3
rich==13.9.4
rpds-py==0.22.3
safetensors==0.4.5
sentencepiece==0.2.0
setuptools==75.6.0
shortuuid==1.0.13
six==1.17.0
sniffio==1.3.1
starlette==0.41.3
sympy==1.13.3
termcolor==2.5.0
tiktoken==0.8.0
timm==1.0.12
tokenizers==0.21.0
# torch==2.4.0
# torchaudio==2.4.0
# torchvision==0.19.0
tqdm==4.67.1
transformers==4.47.1
triton==3.0.0
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.3.0
uvicorn==0.34.0
wheel==0.45.1
xxhash==3.5.0
yapf==0.43.0
yarl==1.18.3

ui.py | 63 (new file)
@@ -0,0 +1,63 @@
import gradio as gr
import asyncio
from endpoints.text import text_query
from endpoints.image import image_query
from endpoints.video import video_query

def setup_ui():
    with gr.Blocks() as ui:
        gr.Markdown(
            """
            # Multimodal Query Interface
            Submit text, image, or video queries and get insights powered by APIs.
            """
        )

        # Tabbed layout
        with gr.Tabs():
            # Text Query Tab
            with gr.Tab("Text Query"):
                gr.Markdown("### Submit a Text Query")
                with gr.Row():
                    text_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    text_button = gr.Button("Submit")
                text_output = gr.Textbox(label="Response", interactive=False)
                text_button.click(
                    fn=lambda q: asyncio.run(text_query(q)),
                    inputs=[text_input],
                    outputs=[text_output]
                )

            # Image Query Tab
            with gr.Tab("Image Query"):
                gr.Markdown("### Submit an Image Query")
                with gr.Row():
                    image_input = gr.File(label="Upload Image")
                    image_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    image_button = gr.Button("Submit")
                image_output = gr.Textbox(label="Response", interactive=False)
                image_button.click(
                    fn=lambda img, q: asyncio.run(image_query(img, q)),
                    inputs=[image_input, image_question_input],
                    outputs=[image_output]
                )

            # Video Query Tab
            with gr.Tab("Video Query"):
                gr.Markdown("### Submit a Video Query")
                with gr.Row():
                    video_input = gr.File(label="Upload Video")
                    video_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    video_button = gr.Button("Submit")
                video_output = gr.Textbox(label="Response", interactive=False)
                video_button.click(
                    fn=lambda vid, q: asyncio.run(video_query(vid, q)),
                    inputs=[video_input, video_question_input],
                    outputs=[video_output]
                )

    return ui

if __name__ == "__main__":
    ui = setup_ui()
    ui.launch(server_name="0.0.0.0", server_port=7860)

utils/__pycache__/audio_transcription.cpython-312.pyc | BIN (new file, binary, not shown)
utils/__pycache__/image_processing.cpython-312.pyc | BIN (new file, binary, not shown)
utils/__pycache__/video_processing.cpython-312.pyc | BIN (new file, binary, not shown)

utils/audio_transcription.py | 13 (new file)
@@ -0,0 +1,13 @@
from pydub import AudioSegment
from whisper import load_model

def extract_audio_from_video(video_path: str) -> str:
    audio = AudioSegment.from_file(video_path)
    audio_path = "/tmp/temp_audio.wav"
    audio.export(audio_path, format="wav")
    return audio_path

def transcribe_audio(audio_path: str) -> str:
    model = load_model("base")
    result = model.transcribe(audio_path)
    return result["text"]
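A small usage sketch (not part of this commit), assuming a clip at a hypothetical local path; extraction writes a temporary wav under /tmp that Whisper then transcribes.

# sketch: extract the audio track and transcribe it
from utils.audio_transcription import extract_audio_from_video, transcribe_audio

wav_path = extract_audio_from_video("example.mp4")  # hypothetical clip
print(transcribe_audio(wav_path))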

utils/image_processing.py | 12 (new file)
@@ -0,0 +1,12 @@
import io
import base64
from PIL import Image

def encode_image_base64(image: Image.Image) -> str:
    """
    Encode a PIL Image to a Base64 string.
    """
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()
utils/video_processing.py
Normal file
93
utils/video_processing.py
Normal file
@ -0,0 +1,93 @@
|
||||
import cv2
|
||||
import os
|
||||
import subprocess
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from pydub import AudioSegment
|
||||
from decord import VideoReader, cpu
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
def split_video_into_segments(video_path, segment_duration=30):
|
||||
"""
|
||||
Splits a video into segments of a specified duration using FFmpeg.
|
||||
"""
|
||||
output_dir = "/tmp/video_segments"
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# Calculate total duration of the video
|
||||
cap = cv2.VideoCapture(video_path)
|
||||
fps = int(cap.get(cv2.CAP_PROP_FPS))
|
||||
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
total_duration = total_frames / fps
|
||||
cap.release()
|
||||
|
||||
segments = []
|
||||
for start_time in range(0, int(total_duration), segment_duration):
|
||||
segment_file = os.path.join(output_dir, f"segment_{start_time}.mp4")
|
||||
command = [
|
||||
"ffmpeg", "-i", video_path,
|
||||
"-ss", str(start_time),
|
||||
"-t", str(segment_duration),
|
||||
"-c", "copy", segment_file
|
||||
]
|
||||
subprocess.run(command, check=True)
|
||||
segments.append(segment_file)
|
||||
|
||||
return segments
|
||||
|
||||
def extract_motion_key_frames(video_path, max_frames=20, sigma_multiplier=2, frame_interval=1):
|
||||
"""
|
||||
Extracts key frames from a video based on motion intensity.
|
||||
"""
|
||||
def calculate_motion(frame_pair):
|
||||
"""
|
||||
Calculates motion between two consecutive frames using optical flow.
|
||||
"""
|
||||
prev_gray, current_frame = frame_pair
|
||||
current_gray = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
|
||||
flow = cv2.calcOpticalFlowFarneback(prev_gray, current_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
|
||||
motion = np.sum(flow ** 2)
|
||||
return motion, current_gray
|
||||
|
||||
# Load video frames using Decord
|
||||
video = VideoReader(video_path, ctx=cpu(0))
|
||||
frames_batch = video.get_batch(range(0, len(video), frame_interval)).asnumpy()
|
||||
|
||||
# Resize frames for faster processing
|
||||
frames = [cv2.resize(frame, (frame.shape[1] // 2, frame.shape[0] // 2)) for frame in frames_batch]
|
||||
|
||||
# Initialize the first frame
|
||||
prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)
|
||||
frame_pairs = [(prev_gray, frames[i]) for i in range(1, len(frames))]
|
||||
|
||||
# Calculate motion statistics
|
||||
motion_values = []
|
||||
with ThreadPoolExecutor() as executor:
|
||||
motion_results = list(executor.map(calculate_motion, frame_pairs))
|
||||
motion_values = [motion for motion, _ in motion_results]
|
||||
|
||||
# Calculate threshold statistically
|
||||
motion_mean = np.mean(motion_values)
|
||||
motion_std = np.std(motion_values)
|
||||
threshold = motion_mean + sigma_multiplier * motion_std
|
||||
|
||||
# Extract key frames based on motion threshold
|
||||
key_frames = []
|
||||
for i, (motion, frame) in enumerate(zip(motion_values, frames[1:])):
|
||||
if motion > threshold and len(key_frames) < max_frames:
|
||||
img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
|
||||
key_frames.append(img)
|
||||
|
||||
return key_frames
|
||||
|
||||
def extract_audio_from_video(video_path):
|
||||
"""
|
||||
Extract audio from video using pydub and save as a temporary audio file.
|
||||
"""
|
||||
print("Audio extraction started...")
|
||||
audio = AudioSegment.from_file(video_path)
|
||||
print("Audio extraction completed.")
|
||||
audio_path = "/tmp/temp_audio.wav"
|
||||
audio.export(audio_path, format="wav")
|
||||
print(f"Audio extracted and saved to: {audio_path}")
|
||||
return audio_path
|
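A sketch (not part of this commit) of how these helpers chain together outside the API, with a hypothetical local clip; this is essentially the per-segment loop in endpoints/video.py.

# sketch: split a clip, then pull key frames and the audio track per segment
from utils.video_processing import (
    split_video_into_segments,
    extract_motion_key_frames,
    extract_audio_from_video,
)

segments = split_video_into_segments("example.mp4", segment_duration=30)  # hypothetical clip
for seg in segments:
    frames = extract_motion_key_frames(seg, max_frames=20, sigma_multiplier=2)
    wav_path = extract_audio_from_video(seg)
    print(seg, f"{len(frames)} key frames, audio at {wav_path}")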