updated ui

This commit is contained in:
Zixiao Wang 2025-01-23 21:50:55 +08:00
parent 0eb6bdc9c4
commit 31a7abea14
18 changed files with 675 additions and 1 deletion

2
.gitignore vendored

@@ -11,3 +11,5 @@
*.app
.snapshots/*
__pycache__/*
endpoints/__pycache__/*

31
Dockerfile Normal file

@@ -0,0 +1,31 @@
FROM python:3.12-slim
WORKDIR /home/ooin/st/app
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    libssl-dev \
    libffi-dev \
    libxml2-dev \
    libxslt1-dev \
    zlib1g-dev \
    libjpeg-dev \
    libopenblas-dev \
    libopenmpi-dev \
    && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip install --upgrade pip
RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 flash-attn==2.7.2.post1 -f https://download.pytorch.org/whl/torch_stable.html
RUN pip install --no-cache-dir -r requirements.txt \
--index-url https://pypi.tuna.tsinghua.edu.cn/simple/ --timeout 100 --retries 5
COPY . .
EXPOSE 80
CMD ["uvicorn", "pipeline_UI3+audio:app", "--host", "0.0.0.0", "--port", "80"]

README.md

@@ -1,2 +1,2 @@
-# tiktok_AI
+# api

Binary file not shown.

Binary file not shown.

53
endpoints/image.py Normal file

@@ -0,0 +1,53 @@
from fastapi import UploadFile, Form
from fastapi.responses import JSONResponse
import asyncio
import io
from PIL import Image
from pipeline_setup import pipe, IMAGE_TOKEN
from utils.image_processing import encode_image_base64


async def image_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process an image with the user's query.
    """
    try:
        if file.content_type not in ["image/jpeg", "image/png"]:
            return JSONResponse({"query": question, "error": "Unsupported file type."})
        image_data = await file.read()
        image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
        encoded_image_base64 = encode_image_base64(image)
        question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
        # The pipeline is called with a (prompt, image) tuple; asyncio is required for to_thread
        response = await asyncio.to_thread(pipe, (question, image))
        return JSONResponse({"query": question, "response": response.text})
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})
# import mimetypes
# async def image_query(file: UploadFile, question: str = Form(...)):
# """
# API endpoint to process an image with the user's query.
# """
# try:
# # Get the file path from the UploadFile object
# file_path = file.filename
# # Determine the file type using the file extension
# file_type, _ = mimetypes.guess_type(file_path)
# if file_type not in ["image/jpeg", "image/png"]:
# return {"query": question, "error": "Unsupported file type."}
# # Read the image file
# image_data = await file.read()
# image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
# encoded_image_base64 = encode_image_base64(image)
# # Prepare the query with the image token
# question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
# # Query the model
# response = await asyncio.to_thread(pipe, (question, image))
# return {"query": question, "response": response.text}
# except Exception as e:
# return {"query": question, "error": str(e)}

24
endpoints/text.py Normal file

@@ -0,0 +1,24 @@
from fastapi import Form
from fastapi.responses import JSONResponse
from asyncio import to_thread
from pipeline_setup import pipe


async def text_query(question: str = Form(...)):
    """
    API endpoint to process text input with the user's query.
    """
    try:
        response = await to_thread(pipe, question)
        return JSONResponse({"query": question, "response": response.text})
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})
# async def text_query(question: str = Form(...)):
# """
# API endpoint to process text input with the user's query.
# """
# try:
# response = await to_thread(pipe, question)
# return {"query": question, "response": response.text}
# except Exception as e:
# return {"query": question, "error": str(e)}

229
endpoints/video.py Normal file

@@ -0,0 +1,229 @@
from fastapi import UploadFile, Form
from fastapi.responses import JSONResponse
from pipeline_setup import pipe
from utils.image_processing import encode_image_base64
from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video
from utils.audio_transcription import transcribe_audio
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor
async def video_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process a video file with the user's query.
    """
    try:
        print("Processing video...")

        # Validate file type
        if file.content_type not in ["video/mp4", "video/avi", "video/mkv"]:
            return JSONResponse({"query": question, "error": "Unsupported video file type."})

        # Start overall timer
        overall_start_time = time.time()

        # Save the uploaded video to a temporary file
        print("Reading video...")
        video_data = await file.read()
        temp_video_path = "/tmp/temp_video.mp4"
        with open(temp_video_path, "wb") as temp_video_file:
            temp_video_file.write(video_data)
        print(f"Temp video saved to: {temp_video_path}")

        # Record the time after reading the video
        video_reading_time = time.time()

        # Split the video into segments
        print("Splitting video...")
        segments = split_video_into_segments(temp_video_path, segment_duration=30)
        print(f"Video split into {len(segments)} segments.")

        aggregated_responses = []
        segment_timings = []

        for i, segment_path in enumerate(segments):
            print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")

            # Start timing for the segment
            segment_start_time = time.time()

            # Extract key frames
            frame_start_time = time.time()
            imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
            frame_time = time.time()

            # Extract audio and transcribe
            audio_start_time = time.time()
            audio_path = extract_audio_from_video(segment_path)
            transcribed_text = transcribe_audio(audio_path)
            audio_time = time.time()

            # Combine transcribed text with the query
            combined_query = f"Audio Transcript: {transcribed_text}\n{question}"

            # Prepare content for the pipeline
            question_with_frames = ""
            for j, img in enumerate(imgs):
                question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
            question_with_frames += combined_query

            content = [{"type": "text", "text": question_with_frames}]
            for img in imgs:
                content.append({
                    "type": "image_url",
                    "image_url": {
                        "max_dynamic_patch": 1,
                        "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
                    }
                })

            # Query the model
            inference_start_time = time.time()
            messages = [dict(role="user", content=content)]
            response = await asyncio.to_thread(pipe, messages)
            inference_time = time.time()

            # Aggregate response
            aggregated_responses.append(response.text)

            # Calculate timing for the segment
            segment_timings.append({
                "segment_index": i + 1,
                "segment_processing_time": inference_time - segment_start_time,
                "frame_extraction_time": frame_time - frame_start_time,
                "audio_extraction_time": audio_time - audio_start_time,
                "model_inference_time": inference_time - inference_start_time
            })

            print(f"transcription: {transcribed_text}")
            # print(f"content: {content}")

        overall_end_time = time.time()

        # Aggregate total timings
        total_timings = {
            "video_reading_time": video_reading_time - overall_start_time,
            "total_segments": len(segments),
            "total_processing_time": overall_end_time - overall_start_time,
            "segment_details": segment_timings
        }

        return JSONResponse({
            "question": question,
            "responses": aggregated_responses,
            "timings": total_timings,
        })
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})
# async def video_query(file: UploadFile, question: str = Form(...)):
# """
# API endpoint to process a video file with the user's query.
# """
# try:
# print("Processing video...")
# # Get the file path from the UploadFile object
# file_path = file.filename
# # Determine the file type using the file extension
# file_type, _ = mimetypes.guess_type(file_path)
# if file_type is None or not file_type.startswith("video/"):
# return {"query": question, "error": "Unsupported video file type."}
# # Start overall timer
# overall_start_time = time.time()
# # Save the uploaded video to a temporary file
# print("Reading video...")
# video_data = await file.read()
# temp_video_path = "/tmp/temp_video.mp4"
# with open(temp_video_path, "wb") as temp_video_file:
# temp_video_file.write(video_data)
# print(f"Temp video saved to: {temp_video_path}")
# # Record the time after reading the video
# video_reading_time = time.time()
# # Split the video into segments
# print("Splitting video...")
# segments = split_video_into_segments(temp_video_path, segment_duration=30)
# print(f"Video split into {len(segments)} segments.")
# aggregated_responses = []
# segment_timings = []
# for i, segment_path in enumerate(segments):
# print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")
# # Start timing for the segment
# segment_start_time = time.time()
# # Extract key frames
# frame_start_time = time.time()
# imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
# frame_time = time.time()
# # Extract audio and transcribe
# audio_start_time = time.time()
# audio_path = extract_audio_from_video(segment_path)
# transcribed_text = transcribe_audio(audio_path)
# audio_time = time.time()
# # Combine transcribed text with the query
# combined_query = f"Audio Transcript: {transcribed_text}\n{question}"
# # Prepare content for the pipeline
# question_with_frames = ""
# for j, img in enumerate(imgs):
# question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
# question_with_frames += combined_query
# content = [{"type": "text", "text": question_with_frames}]
# for img in imgs:
# content.append({
# "type": "image_url",
# "image_url": {
# "max_dynamic_patch": 1,
# "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
# }
# })
# # Query the model
# inference_start_time = time.time()
# messages = [dict(role="user", content=content)]
# response = await asyncio.to_thread(pipe, messages)
# inference_time = time.time()
# # Aggregate response
# aggregated_responses.append(response.text)
# # Calculate timing for the segment
# segment_timings.append({
# "segment_index": i + 1,
# "segment_processing_time": inference_time - segment_start_time,
# "frame_extraction_time": frame_time - frame_start_time,
# "audio_extraction_time": audio_time - audio_start_time,
# "model_inference_time": inference_time - inference_start_time
# })
# print(f"transcription: {transcribed_text}")
# overall_end_time = time.time()
# # Aggregate total timings
# total_timings = {
# "video_reading_time": video_reading_time - overall_start_time,
# "total_segments": len(segments),
# "total_processing_time": overall_end_time - overall_start_time,
# "segment_details": segment_timings
# }
# return {
# "question": question,
# "responses": aggregated_responses,
# "timings": total_timings,
# }
# except Exception as e:
# return {"query": question, "error": str(e)}

18
main.py Normal file

@@ -0,0 +1,18 @@
from fastapi import FastAPI
from endpoints.text import text_query
from endpoints.image import image_query
from endpoints.video import video_query

app = FastAPI()

# Register routes
app.post("/api/text")(text_query)
app.post("/api/image")(image_query)
app.post("/api/video")(video_query)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8080, reload=True)

# python main.py
# uvicorn main:app --reload
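
For reference, a minimal client sketch (not part of this commit) of how the routes registered above could be exercised with the requests library; the base URL and example.jpg are assumptions for illustration only.

import requests

BASE = "http://localhost:8080"  # assumed: local dev server started via `python main.py`

# Text query: plain form field, matching text_query(question: str = Form(...))
r = requests.post(f"{BASE}/api/text", data={"question": "What is InternVL?"})
print(r.json())

# Image query: multipart upload plus the form field expected by image_query
with open("example.jpg", "rb") as f:  # hypothetical local file
    r = requests.post(
        f"{BASE}/api/image",
        files={"file": ("example.jpg", f, "image/jpeg")},
        data={"question": "Describe this image."},
    )
print(r.json())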

19
pipeline_setup.py Normal file

@@ -0,0 +1,19 @@
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig

# Constants
IMAGE_TOKEN = "[IMAGE_TOKEN]"

# Model initialization
model = "OpenGVLab/InternVL2-26B-AWQ"
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(
        model_format="awq",
        tp=4,
        session_len=12864,
        max_batch_size=1,
        cache_max_entry_count=0.05,
        cache_block_seq_len=32768,
        quant_policy=4
    )
)
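
As a rough illustration (not part of this commit), the shared pipe object can be called directly with either a plain prompt or a (prompt, image) tuple, which is how endpoints/text.py and endpoints/image.py use it; example.jpg is a hypothetical file and the snippet assumes the AWQ weights are available.

from PIL import Image
from pipeline_setup import pipe

# Plain text prompt, as in endpoints/text.py
print(pipe("Summarize optical flow in one sentence.").text)

# (prompt, image) tuple, as in endpoints/image.py
img = Image.open("example.jpg").convert("RGB")  # hypothetical input image
print(pipe(("What is shown in this image?", img)).text)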

117
requirements.txt Normal file

@@ -0,0 +1,117 @@
accelerate==1.2.1
addict==2.4.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.7.0
attrs==24.3.0
bitsandbytes==0.45.0
certifi==2024.12.14
charset-normalizer==3.4.0
click==8.1.8
cloudpickle==3.1.0
datasets==3.2.0
decord==0.6.0
dill==0.3.8
diskcache==5.6.3
distro==1.9.0
einops==0.8.0
fastapi==0.115.6
filelock==3.16.1
fire==0.7.0
# flash-attn==2.7.2.post1
frozenlist==1.5.0
fsspec==2024.9.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
huggingface-hub==0.27.0
idna==3.10
interegular==0.3.3
Jinja2==3.1.5
jiter==0.8.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
lark==1.2.2
llvmlite==0.43.0
lmdeploy==0.6.4
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
mmengine-lite==0.10.5
modelscope==1.21.0
mpmath==1.3.0
multidict==6.1.0
multiprocess==0.70.16
nest-asyncio==1.6.0
networkx==3.4.2
ninja==1.11.1.3
numba==0.60.0
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-ml-py==12.560.30
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.1.105
openai==1.58.1
opencv-python==4.10.0.84
outlines==0.0.46
packaging==24.2
pandas==2.2.3
peft==0.11.1
pillow==11.0.0
platformdirs==4.3.6
propcache==0.2.1
protobuf==5.29.2
psutil==6.1.1
pyairports==2.1.1
pyarrow==18.1.0
pycountry==24.6.1
pydantic==2.10.4
pydantic_core==2.27.2
Pygments==2.18.0
pynvml==12.0.0
python-dateutil==2.9.0.post0
python-multipart==0.0.20
pytz==2024.2
PyYAML==6.0.2
referencing==0.35.1
regex==2024.11.6
requests==2.32.3
rich==13.9.4
rpds-py==0.22.3
safetensors==0.4.5
sentencepiece==0.2.0
setuptools==75.6.0
shortuuid==1.0.13
six==1.17.0
sniffio==1.3.1
starlette==0.41.3
sympy==1.13.3
termcolor==2.5.0
tiktoken==0.8.0
timm==1.0.12
tokenizers==0.21.0
# torch==2.4.0
# torchaudio==2.4.0
# torchvision==0.19.0
tqdm==4.67.1
transformers==4.47.1
triton==3.0.0
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.3.0
uvicorn==0.34.0
wheel==0.45.1
xxhash==3.5.0
yapf==0.43.0
yarl==1.18.3

63
ui.py Normal file

@@ -0,0 +1,63 @@
import gradio as gr
import asyncio
from endpoints.text import text_query
from endpoints.image import image_query
from endpoints.video import video_query


def setup_ui():
    with gr.Blocks() as ui:
        gr.Markdown(
            """
            # Multimodal Query Interface
            Submit text, image, or video queries and get insights powered by APIs.
            """
        )
        # Tabbed layout
        with gr.Tabs():
            # Text Query Tab
            with gr.Tab("Text Query"):
                gr.Markdown("### Submit a Text Query")
                with gr.Row():
                    text_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    text_button = gr.Button("Submit")
                text_output = gr.Textbox(label="Response", interactive=False)
                text_button.click(
                    fn=lambda q: asyncio.run(text_query(q)),
                    inputs=[text_input],
                    outputs=[text_output]
                )
            # Image Query Tab
            with gr.Tab("Image Query"):
                gr.Markdown("### Submit an Image Query")
                with gr.Row():
                    image_input = gr.File(label="Upload Image")
                    image_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    image_button = gr.Button("Submit")
                image_output = gr.Textbox(label="Response", interactive=False)
                image_button.click(
                    fn=lambda img, q: asyncio.run(image_query(img, q)),
                    inputs=[image_input, image_question_input],
                    outputs=[image_output]
                )
            # Video Query Tab
            with gr.Tab("Video Query"):
                gr.Markdown("### Submit a Video Query")
                with gr.Row():
                    video_input = gr.File(label="Upload Video")
                    video_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    video_button = gr.Button("Submit")
                video_output = gr.Textbox(label="Response", interactive=False)
                video_button.click(
                    fn=lambda vid, q: asyncio.run(video_query(vid, q)),
                    inputs=[video_input, video_question_input],
                    outputs=[video_output]
                )
    return ui


if __name__ == "__main__":
    ui = setup_ui()
    ui.launch(server_name="0.0.0.0", server_port=7860)

Binary file not shown.

Binary file not shown.

Binary file not shown.

13
utils/audio_transcription.py Normal file

@@ -0,0 +1,13 @@
from pydub import AudioSegment
from whisper import load_model


def extract_audio_from_video(video_path: str) -> str:
    audio = AudioSegment.from_file(video_path)
    audio_path = "/tmp/temp_audio.wav"
    audio.export(audio_path, format="wav")
    return audio_path


def transcribe_audio(audio_path: str) -> str:
    model = load_model("base")
    result = model.transcribe(audio_path)
    return result["text"]
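
A quick sketch (not part of this commit) of using these helpers on a standalone clip; example_clip.mp4 is a hypothetical file, and the snippet assumes ffmpeg is installed for pydub and the Whisper "base" weights can be downloaded.

from utils.audio_transcription import extract_audio_from_video, transcribe_audio

wav_path = extract_audio_from_video("example_clip.mp4")  # hypothetical input clip
print(transcribe_audio(wav_path))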

12
utils/image_processing.py Normal file

@@ -0,0 +1,12 @@
import io
import base64
from PIL import Image


def encode_image_base64(image: Image.Image) -> str:
    """
    Encode a PIL Image to a Base64 string.
    """
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()
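
For illustration (not part of this commit), this helper is what endpoints/video.py uses to build data: URLs for each key frame; a minimal sketch assuming a hypothetical example.jpg.

from PIL import Image
from utils.image_processing import encode_image_base64

img = Image.open("example.jpg").convert("RGB")  # hypothetical input image
data_url = f"data:image/jpeg;base64,{encode_image_base64(img)}"
print(data_url[:64], "...")  # print only a prefix of the encoded payload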

93
utils/video_processing.py Normal file

@@ -0,0 +1,93 @@
import cv2
import os
import subprocess
import numpy as np
from PIL import Image
from pydub import AudioSegment
from decord import VideoReader, cpu
from concurrent.futures import ThreadPoolExecutor


def split_video_into_segments(video_path, segment_duration=30):
    """
    Splits a video into segments of a specified duration using FFmpeg.
    """
    output_dir = "/tmp/video_segments"
    os.makedirs(output_dir, exist_ok=True)

    # Calculate total duration of the video
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    total_duration = total_frames / fps
    cap.release()

    segments = []
    for start_time in range(0, int(total_duration), segment_duration):
        segment_file = os.path.join(output_dir, f"segment_{start_time}.mp4")
        command = [
            "ffmpeg", "-i", video_path,
            "-ss", str(start_time),
            "-t", str(segment_duration),
            "-c", "copy", segment_file
        ]
        subprocess.run(command, check=True)
        segments.append(segment_file)
    return segments


def extract_motion_key_frames(video_path, max_frames=20, sigma_multiplier=2, frame_interval=1):
    """
    Extracts key frames from a video based on motion intensity.
    """
    def calculate_motion(frame_pair):
        """
        Calculates motion between two consecutive frames using optical flow.
        """
        prev_gray, current_frame = frame_pair
        current_gray = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
        flow = cv2.calcOpticalFlowFarneback(prev_gray, current_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
        motion = np.sum(flow ** 2)
        return motion, current_gray

    # Load video frames using Decord
    video = VideoReader(video_path, ctx=cpu(0))
    frames_batch = video.get_batch(range(0, len(video), frame_interval)).asnumpy()

    # Resize frames for faster processing
    frames = [cv2.resize(frame, (frame.shape[1] // 2, frame.shape[0] // 2)) for frame in frames_batch]

    # Initialize the first frame
    prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)
    frame_pairs = [(prev_gray, frames[i]) for i in range(1, len(frames))]

    # Calculate motion statistics
    motion_values = []
    with ThreadPoolExecutor() as executor:
        motion_results = list(executor.map(calculate_motion, frame_pairs))
    motion_values = [motion for motion, _ in motion_results]

    # Calculate threshold statistically
    motion_mean = np.mean(motion_values)
    motion_std = np.std(motion_values)
    threshold = motion_mean + sigma_multiplier * motion_std

    # Extract key frames based on motion threshold
    key_frames = []
    for i, (motion, frame) in enumerate(zip(motion_values, frames[1:])):
        if motion > threshold and len(key_frames) < max_frames:
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            key_frames.append(img)
    return key_frames


def extract_audio_from_video(video_path):
    """
    Extract audio from video using pydub and save as a temporary audio file.
    """
    print("Audio extraction started...")
    audio = AudioSegment.from_file(video_path)
    print("Audio extraction completed.")
    audio_path = "/tmp/temp_audio.wav"
    audio.export(audio_path, format="wav")
    print(f"Audio extracted and saved to: {audio_path}")
    return audio_path
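
A rough end-to-end sketch (not part of this commit) of the segmentation and key-frame path used by endpoints/video.py; example_clip.mp4 is a hypothetical file and ffmpeg is assumed to be on PATH.

from utils.video_processing import split_video_into_segments, extract_motion_key_frames

segments = split_video_into_segments("example_clip.mp4", segment_duration=30)
for seg in segments:
    frames = extract_motion_key_frames(seg, max_frames=10, sigma_multiplier=2)
    print(seg, "->", len(frames), "PIL key frames")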