diff --git a/.gitignore b/.gitignore
index 60d7971..f479596 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,5 @@
 *.app
 .snapshots/*
+__pycache__/*
+endpoints/__pycache__/*
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..9a038e2
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,31 @@
+FROM python:3.12-slim
+
+WORKDIR /home/ooin/st/app
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    libssl-dev \
+    libffi-dev \
+    libxml2-dev \
+    libxslt1-dev \
+    zlib1g-dev \
+    libjpeg-dev \
+    libopenblas-dev \
+    libopenmpi-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+
+RUN pip install --upgrade pip
+
+RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 flash-attn==2.7.2.post1 -f https://download.pytorch.org/whl/torch_stable.html
+
+RUN pip install --no-cache-dir -r requirements.txt \
+    --index-url https://pypi.tuna.tsinghua.edu.cn/simple/ --timeout 100 --retries 5
+
+COPY . .
+
+EXPOSE 80
+
+CMD ["uvicorn", "pipeline_UI3+audio:app", "--host", "0.0.0.0", "--port", "80"]
\ No newline at end of file
diff --git a/README.md b/README.md
index cc8efb3..a1df0ab 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,2 @@
-# tiktok_AI
+# api
diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc
new file mode 100644
index 0000000..c2dbc97
Binary files /dev/null and b/__pycache__/main.cpython-312.pyc differ
diff --git a/__pycache__/pipeline_setup.cpython-312.pyc b/__pycache__/pipeline_setup.cpython-312.pyc
new file mode 100644
index 0000000..6ed9844
Binary files /dev/null and b/__pycache__/pipeline_setup.cpython-312.pyc differ
diff --git a/endpoints/image.py b/endpoints/image.py
new file mode 100644
index 0000000..e46c558
--- /dev/null
+++ b/endpoints/image.py
@@ -0,0 +1,53 @@
+from fastapi import UploadFile, Form
+from fastapi.responses import JSONResponse
+import io
+import asyncio
+from PIL import Image
+from pipeline_setup import pipe, IMAGE_TOKEN
+from utils.image_processing import encode_image_base64
+
+async def image_query(file: UploadFile, question: str = Form(...)):
+    """
+    API endpoint to process an image with the user's query.
+    """
+    try:
+        if file.content_type not in ["image/jpeg", "image/png"]:
+            return JSONResponse({"query": question, "error": "Unsupported file type."})
+
+        image_data = await file.read()
+        image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
+        encoded_image_base64 = encode_image_base64(image)
+
+        question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
+        response = await asyncio.to_thread(pipe, (question_with_image_token, image))
+        return JSONResponse({"query": question, "response": response.text})
+    except Exception as e:
+        return JSONResponse({"query": question, "error": str(e)})
+
+
+# import mimetypes
+# async def image_query(file: UploadFile, question: str = Form(...)):
+#     """
+#     API endpoint to process an image with the user's query.
+# """ +# try: +# # Get the file path from the UploadFile object +# file_path = file.filename + +# # Determine the file type using the file extension +# file_type, _ = mimetypes.guess_type(file_path) +# if file_type not in ["image/jpeg", "image/png"]: +# return {"query": question, "error": "Unsupported file type."} + +# # Read the image file +# image_data = await file.read() +# image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512)) +# encoded_image_base64 = encode_image_base64(image) + +# # Prepare the query with the image token +# question_with_image_token = f"{question}\n{IMAGE_TOKEN}" + +# # Query the model +# response = await asyncio.to_thread(pipe, (question, image)) +# return {"query": question, "response": response.text} +# except Exception as e: +# return {"query": question, "error": str(e)} diff --git a/endpoints/text.py b/endpoints/text.py new file mode 100644 index 0000000..f1afc03 --- /dev/null +++ b/endpoints/text.py @@ -0,0 +1,24 @@ +from fastapi import Form +from fastapi.responses import JSONResponse +from asyncio import to_thread +from pipeline_setup import pipe + +async def text_query(question: str = Form(...)): + """ + API endpoint to process text input with the user's query. + """ + try: + response = await to_thread(pipe, question) + return JSONResponse({"query": question, "response": response.text}) + except Exception as e: + return JSONResponse({"query": question, "error": str(e)}) + +# async def text_query(question: str = Form(...)): +# """ +# API endpoint to process text input with the user's query. +# """ +# try: +# response = await to_thread(pipe, question) +# return {"query": question, "response": response.text} +# except Exception as e: +# return {"query": question, "error": str(e)} diff --git a/endpoints/video.py b/endpoints/video.py new file mode 100644 index 0000000..62fbc4a --- /dev/null +++ b/endpoints/video.py @@ -0,0 +1,229 @@ +from fastapi import UploadFile, Form +from fastapi.responses import JSONResponse +from pipeline_setup import pipe +from utils.image_processing import encode_image_base64 +from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video +from utils.audio_transcription import transcribe_audio +import asyncio +import time +from concurrent.futures import ThreadPoolExecutor + +async def video_query(file: UploadFile, question: str = Form(...)): + """ + API endpoint to process a video file with the user's query. 
+ """ + try: + print("Processing video...") + + # Validate file type + if file.content_type not in ["video/mp4", "video/avi", "video/mkv"]: + return JSONResponse({"query": question, "error": "Unsupported video file type."}) + + # Start overall timer + overall_start_time = time.time() + + # Save the uploaded video to a temporary file + print("Reading video...") + video_data = await file.read() + temp_video_path = "/tmp/temp_video.mp4" + with open(temp_video_path, "wb") as temp_video_file: + temp_video_file.write(video_data) + print(f"Temp video saved to: {temp_video_path}") + + # Record the time after reading the video + video_reading_time = time.time() + + # Split the video into segments + print("Splitting video...") + segments = split_video_into_segments(temp_video_path, segment_duration=30) + print(f"Video split into {len(segments)} segments.") + + aggregated_responses = [] + segment_timings = [] + + for i, segment_path in enumerate(segments): + print(f"Processing segment {i+1}/{len(segments)}: {segment_path}") + + # Start timing for the segment + segment_start_time = time.time() + + # Extract key frames + frame_start_time = time.time() + imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2) + frame_time = time.time() + + # Extract audio and transcribe + audio_start_time = time.time() + audio_path = extract_audio_from_video(segment_path) + transcribed_text = transcribe_audio(audio_path) + audio_time = time.time() + + # Combine transcribed text with the query + combined_query = f"Audio Transcript: {transcribed_text}\n{question}" + + # Prepare content for the pipeline + question_with_frames = "" + for j, img in enumerate(imgs): + question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n" + question_with_frames += combined_query + + content = [{"type": "text", "text": question_with_frames}] + for img in imgs: + content.append({ + "type": "image_url", + "image_url": { + "max_dynamic_patch": 1, + "url": f"data:image/jpeg;base64,{encode_image_base64(img)}" + } + }) + + # Query the model + inference_start_time = time.time() + messages = [dict(role="user", content=content)] + response = await asyncio.to_thread(pipe, messages) + inference_time = time.time() + + # Aggregate response + aggregated_responses.append(response.text) + + # Calculate timing for the segment + segment_timings.append({ + "segment_index": i + 1, + "segment_processing_time": inference_time - segment_start_time, + "frame_extraction_time": frame_time - frame_start_time, + "audio_extraction_time": audio_time - audio_start_time, + "model_inference_time": inference_time - inference_start_time + }) + + print(f"transcription: {transcribed_text}") + # print(f"content: {content}") + + overall_end_time = time.time() + + # Aggregate total timings + total_timings = { + "video_reading_time": video_reading_time - overall_start_time, + "total_segments": len(segments), + "total_processing_time": overall_end_time - overall_start_time, + "segment_details": segment_timings + } + + return JSONResponse({ + "question": question, + "responses": aggregated_responses, + "timings": total_timings, + }) + except Exception as e: + return JSONResponse({"query": question, "error": str(e)}) + + +# async def video_query(file: UploadFile, question: str = Form(...)): +# """ +# API endpoint to process a video file with the user's query. 
+#     """
+#     try:
+#         print("Processing video...")
+
+#         # Get the file path from the UploadFile object
+#         file_path = file.filename
+
+#         # Determine the file type using the file extension
+#         file_type, _ = mimetypes.guess_type(file_path)
+#         if file_type is None or not file_type.startswith("video/"):
+#             return {"query": question, "error": "Unsupported video file type."}
+
+#         # Start overall timer
+#         overall_start_time = time.time()
+
+#         # Save the uploaded video to a temporary file
+#         print("Reading video...")
+#         video_data = await file.read()
+#         temp_video_path = "/tmp/temp_video.mp4"
+#         with open(temp_video_path, "wb") as temp_video_file:
+#             temp_video_file.write(video_data)
+#         print(f"Temp video saved to: {temp_video_path}")
+
+#         # Record the time after reading the video
+#         video_reading_time = time.time()
+
+#         # Split the video into segments
+#         print("Splitting video...")
+#         segments = split_video_into_segments(temp_video_path, segment_duration=30)
+#         print(f"Video split into {len(segments)} segments.")
+
+#         aggregated_responses = []
+#         segment_timings = []
+
+#         for i, segment_path in enumerate(segments):
+#             print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")
+
+#             # Start timing for the segment
+#             segment_start_time = time.time()
+
+#             # Extract key frames
+#             frame_start_time = time.time()
+#             imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
+#             frame_time = time.time()
+
+#             # Extract audio and transcribe
+#             audio_start_time = time.time()
+#             audio_path = extract_audio_from_video(segment_path)
+#             transcribed_text = transcribe_audio(audio_path)
+#             audio_time = time.time()
+
+#             # Combine transcribed text with the query
+#             combined_query = f"Audio Transcript: {transcribed_text}\n{question}"
+
+#             # Prepare content for the pipeline
+#             question_with_frames = ""
+#             for j, img in enumerate(imgs):
+#                 question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
+#             question_with_frames += combined_query
+
+#             content = [{"type": "text", "text": question_with_frames}]
+#             for img in imgs:
+#                 content.append({
+#                     "type": "image_url",
+#                     "image_url": {
+#                         "max_dynamic_patch": 1,
+#                         "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
+#                     }
+#                 })
+
+#             # Query the model
+#             inference_start_time = time.time()
+#             messages = [dict(role="user", content=content)]
+#             response = await asyncio.to_thread(pipe, messages)
+#             inference_time = time.time()
+
+#             # Aggregate response
+#             aggregated_responses.append(response.text)
+
+#             # Calculate timing for the segment
+#             segment_timings.append({
+#                 "segment_index": i + 1,
+#                 "segment_processing_time": inference_time - segment_start_time,
+#                 "frame_extraction_time": frame_time - frame_start_time,
+#                 "audio_extraction_time": audio_time - audio_start_time,
+#                 "model_inference_time": inference_time - inference_start_time
+#             })
+
+#             print(f"transcription: {transcribed_text}")
+
+#         overall_end_time = time.time()
+
+#         # Aggregate total timings
+#         total_timings = {
+#             "video_reading_time": video_reading_time - overall_start_time,
+#             "total_segments": len(segments),
+#             "total_processing_time": overall_end_time - overall_start_time,
+#             "segment_details": segment_timings
+#         }
+
+#         return {
+#             "question": question,
+#             "responses": aggregated_responses,
+#             "timings": total_timings,
+#         }
+#     except Exception as e:
+#         return {"query": question, "error": str(e)}
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..97d8b51
--- /dev/null
+++ b/main.py
@@ -0,0 +1,18 @@
+from fastapi import FastAPI
+from endpoints.text import text_query
+from endpoints.image import image_query
+from endpoints.video import video_query
+
+app = FastAPI()
+
+# Register routes
+app.post("/api/text")(text_query)
+app.post("/api/image")(image_query)
+app.post("/api/video")(video_query)
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run("main:app", host="0.0.0.0", port=8080, reload=True)
+
+# python main.py
+# uvicorn main:app --reload
\ No newline at end of file
diff --git a/pipeline_setup.py b/pipeline_setup.py
new file mode 100644
index 0000000..e27eb31
--- /dev/null
+++ b/pipeline_setup.py
@@ -0,0 +1,19 @@
+from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
+
+# Constants
+IMAGE_TOKEN = "[IMAGE_TOKEN]"
+
+# Model initialization
+model = "OpenGVLab/InternVL2-26B-AWQ"
+pipe = pipeline(
+    model,
+    backend_config=TurbomindEngineConfig(
+        model_format="awq",
+        tp=4,
+        session_len=12864,
+        max_batch_size=1,
+        cache_max_entry_count=0.05,
+        cache_block_seq_len=32768,
+        quant_policy=4
+    )
+)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8806a97
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,117 @@
+accelerate==1.2.1
+addict==2.4.0
+aiohappyeyeballs==2.4.4
+aiohttp==3.11.11
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.7.0
+attrs==24.3.0
+bitsandbytes==0.45.0
+certifi==2024.12.14
+charset-normalizer==3.4.0
+click==8.1.8
+cloudpickle==3.1.0
+datasets==3.2.0
+decord==0.6.0
+dill==0.3.8
+diskcache==5.6.3
+distro==1.9.0
+einops==0.8.0
+fastapi==0.115.6
+filelock==3.16.1
+fire==0.7.0
+# flash-attn==2.7.2.post1
+frozenlist==1.5.0
+fsspec==2024.9.0
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.27.0
+idna==3.10
+interegular==0.3.3
+Jinja2==3.1.5
+jiter==0.8.2
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+lark==1.2.2
+llvmlite==0.43.0
+lmdeploy==0.6.4
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mmengine-lite==0.10.5
+modelscope==1.21.0
+mpmath==1.3.0
+multidict==6.1.0
+multiprocess==0.70.16
+nest-asyncio==1.6.0
+networkx==3.4.2
+ninja==1.11.1.3
+numba==0.60.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-ml-py==12.560.30
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.6.85
+nvidia-nvtx-cu12==12.1.105
+openai==1.58.1
+opencv-python==4.10.0.84
+outlines==0.0.46
+packaging==24.2
+pandas==2.2.3
+peft==0.11.1
+pillow==11.0.0
+platformdirs==4.3.6
+propcache==0.2.1
+protobuf==5.29.2
+psutil==6.1.1
+pyairports==2.1.1
+pyarrow==18.1.0
+pycountry==24.6.1
+pydantic==2.10.4
+pydantic_core==2.27.2
+Pygments==2.18.0
+pynvml==12.0.0
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2024.2
+PyYAML==6.0.2
+referencing==0.35.1
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+rpds-py==0.22.3
+safetensors==0.4.5
+sentencepiece==0.2.0
+setuptools==75.6.0
+shortuuid==1.0.13
+six==1.17.0
+sniffio==1.3.1
+starlette==0.41.3
+sympy==1.13.3
+termcolor==2.5.0
+tiktoken==0.8.0
+timm==1.0.12
+tokenizers==0.21.0
+# torch==2.4.0
+# torchaudio==2.4.0
+# torchvision==0.19.0
+tqdm==4.67.1
+transformers==4.47.1
+triton==3.0.0
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.3.0
+uvicorn==0.34.0
+wheel==0.45.1
+xxhash==3.5.0
+yapf==0.43.0
+yarl==1.18.3
diff --git a/ui.py b/ui.py
new file mode 100644
index 0000000..3263e18
--- /dev/null
+++ b/ui.py
@@ -0,0 +1,63 @@
+import gradio as gr
+import asyncio
+from endpoints.text import text_query
+from endpoints.image import image_query
+from endpoints.video import video_query
+
+def setup_ui():
+    with gr.Blocks() as ui:
+        gr.Markdown(
+            """
+            # Multimodal Query Interface
+            Submit text, image, or video queries and get insights powered by APIs.
+            """
+        )
+
+        # Tabbed layout
+        with gr.Tabs():
+            # Text Query Tab
+            with gr.Tab("Text Query"):
+                gr.Markdown("### Submit a Text Query")
+                with gr.Row():
+                    text_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
+                    text_button = gr.Button("Submit")
+                text_output = gr.Textbox(label="Response", interactive=False)
+                text_button.click(
+                    fn=lambda q: asyncio.run(text_query(q)),
+                    inputs=[text_input],
+                    outputs=[text_output]
+                )
+
+            # Image Query Tab
+            with gr.Tab("Image Query"):
+                gr.Markdown("### Submit an Image Query")
+                with gr.Row():
+                    image_input = gr.File(label="Upload Image")
+                    image_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
+                image_button = gr.Button("Submit")
+                image_output = gr.Textbox(label="Response", interactive=False)
+                image_button.click(
+                    fn=lambda img, q: asyncio.run(image_query(img, q)),
+                    inputs=[image_input, image_question_input],
+                    outputs=[image_output]
+                )
+
+            # Video Query Tab
+            with gr.Tab("Video Query"):
+                gr.Markdown("### Submit a Video Query")
+                with gr.Row():
+                    video_input = gr.File(label="Upload Video")
+                    video_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
+                video_button = gr.Button("Submit")
+                video_output = gr.Textbox(label="Response", interactive=False)
+                video_button.click(
+                    fn=lambda vid, q: asyncio.run(video_query(vid, q)),
+                    inputs=[video_input, video_question_input],
+                    outputs=[video_output]
+                )
+
+    return ui
+
+if __name__ == "__main__":
+    ui = setup_ui()
+    ui.launch(server_name="0.0.0.0", server_port=7860)
\ No newline at end of file
diff --git a/utils/__pycache__/audio_transcription.cpython-312.pyc b/utils/__pycache__/audio_transcription.cpython-312.pyc
new file mode 100644
index 0000000..0165171
Binary files /dev/null and b/utils/__pycache__/audio_transcription.cpython-312.pyc differ
diff --git a/utils/__pycache__/image_processing.cpython-312.pyc b/utils/__pycache__/image_processing.cpython-312.pyc
new file mode 100644
index 0000000..139c246
Binary files /dev/null and b/utils/__pycache__/image_processing.cpython-312.pyc differ
diff --git a/utils/__pycache__/video_processing.cpython-312.pyc b/utils/__pycache__/video_processing.cpython-312.pyc
new file mode 100644
index 0000000..99900a4
Binary files /dev/null and b/utils/__pycache__/video_processing.cpython-312.pyc differ
diff --git a/utils/audio_transcription.py b/utils/audio_transcription.py
new file mode 100644
index 0000000..82d4705
--- /dev/null
+++ b/utils/audio_transcription.py
@@ -0,0 +1,13 @@
+from pydub import AudioSegment
+from whisper import load_model
+
+def extract_audio_from_video(video_path: str) -> str:
+    audio = AudioSegment.from_file(video_path)
+    audio_path = "/tmp/temp_audio.wav"
+    audio.export(audio_path, format="wav")
+    return audio_path
+
+def transcribe_audio(audio_path: str) -> str:
+    model = load_model("base")
+    result = model.transcribe(audio_path)
+    return result["text"]
diff --git a/utils/image_processing.py b/utils/image_processing.py
new file mode 100644
index 0000000..3ce45e9
--- /dev/null
+++ b/utils/image_processing.py
@@ -0,0 +1,12 @@
+import io
+import base64
+from PIL import Image
+
+def encode_image_base64(image: Image.Image) -> str:
+    """
+    Encode a PIL Image to a Base64 string.
+    """
+    buffered = io.BytesIO()
+    image.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode()
+
diff --git a/utils/video_processing.py b/utils/video_processing.py
new file mode 100644
index 0000000..28c18fb
--- /dev/null
+++ b/utils/video_processing.py
@@ -0,0 +1,93 @@
+import cv2
+import os
+import subprocess
+import numpy as np
+from PIL import Image
+from pydub import AudioSegment
+from decord import VideoReader, cpu
+from concurrent.futures import ThreadPoolExecutor
+
+def split_video_into_segments(video_path, segment_duration=30):
+    """
+    Splits a video into segments of a specified duration using FFmpeg.
+    """
+    output_dir = "/tmp/video_segments"
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Calculate total duration of the video
+    cap = cv2.VideoCapture(video_path)
+    fps = int(cap.get(cv2.CAP_PROP_FPS))
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    total_duration = total_frames / fps
+    cap.release()
+
+    segments = []
+    for start_time in range(0, int(total_duration), segment_duration):
+        segment_file = os.path.join(output_dir, f"segment_{start_time}.mp4")
+        command = [
+            "ffmpeg", "-i", video_path,
+            "-ss", str(start_time),
+            "-t", str(segment_duration),
+            "-c", "copy", segment_file
+        ]
+        subprocess.run(command, check=True)
+        segments.append(segment_file)
+
+    return segments
+
+def extract_motion_key_frames(video_path, max_frames=20, sigma_multiplier=2, frame_interval=1):
+    """
+    Extracts key frames from a video based on motion intensity.
+    """
+    def calculate_motion(frame_pair):
+        """
+        Calculates motion between two consecutive frames using optical flow.
+        """
+        prev_gray, current_frame = frame_pair
+        current_gray = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
+        flow = cv2.calcOpticalFlowFarneback(prev_gray, current_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
+        motion = np.sum(flow ** 2)
+        return motion, current_gray
+
+    # Load video frames using Decord
+    video = VideoReader(video_path, ctx=cpu(0))
+    frames_batch = video.get_batch(range(0, len(video), frame_interval)).asnumpy()
+
+    # Resize frames for faster processing
+    frames = [cv2.resize(frame, (frame.shape[1] // 2, frame.shape[0] // 2)) for frame in frames_batch]
+
+    # Pair each frame with the grayscale of the frame before it (consecutive-frame motion)
+    grays = [cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) for frame in frames]
+    frame_pairs = [(grays[i - 1], frames[i]) for i in range(1, len(frames))]
+
+    # Calculate motion statistics
+    motion_values = []
+    with ThreadPoolExecutor() as executor:
+        motion_results = list(executor.map(calculate_motion, frame_pairs))
+        motion_values = [motion for motion, _ in motion_results]
+
+    # Calculate threshold statistically
+    motion_mean = np.mean(motion_values)
+    motion_std = np.std(motion_values)
+    threshold = motion_mean + sigma_multiplier * motion_std
+
+    # Extract key frames based on motion threshold
+    key_frames = []
+    for i, (motion, frame) in enumerate(zip(motion_values, frames[1:])):
+        if motion > threshold and len(key_frames) < max_frames:
+            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            key_frames.append(img)
+
+    return key_frames
+
+def extract_audio_from_video(video_path):
+    """
+    Extract audio from video using pydub and save as a temporary audio file.
+ """ + print("Audio extraction started...") + audio = AudioSegment.from_file(video_path) + print("Audio extraction completed.") + audio_path = "/tmp/temp_audio.wav" + audio.export(audio_path, format="wav") + print(f"Audio extracted and saved to: {audio_path}") + return audio_path
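
Example usage (a minimal client sketch, not part of the diff above): the routes /api/text, /api/image, and /api/video and the form field names `question` and `file` come from main.py and the endpoint signatures, and port 8080 matches main.py's __main__ block; the media file paths below are placeholders, and `requests` is assumed to be installed on the client.

import requests

BASE_URL = "http://localhost:8080"  # assumes the server was started with `python main.py`

# Text endpoint: `question` matches the Form(...) parameter in endpoints/text.py
resp = requests.post(f"{BASE_URL}/api/text", data={"question": "What can this API do?"})
print(resp.json())

# Image endpoint: only image/jpeg and image/png are accepted by endpoints/image.py
with open("sample.jpg", "rb") as f:  # placeholder path
    resp = requests.post(
        f"{BASE_URL}/api/image",
        data={"question": "What is in this image?"},
        files={"file": ("sample.jpg", f, "image/jpeg")},
    )
print(resp.json())

# Video endpoint: mp4, avi, and mkv are accepted by endpoints/video.py
with open("sample.mp4", "rb") as f:  # placeholder path
    resp = requests.post(
        f"{BASE_URL}/api/video",
        data={"question": "Summarize this video."},
        files={"file": ("sample.mp4", f, "video/mp4")},
    )
print(resp.json())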