updated ui

This commit is contained in:
Zixiao Wang 2025-01-23 21:50:55 +08:00
parent 0eb6bdc9c4
commit 31a7abea14
18 changed files with 675 additions and 1 deletion

2
.gitignore vendored

@@ -11,3 +11,5 @@
*.app
.snapshots/*
__pycache__/*
endpoints/__pycache__/*

31
Dockerfile Normal file

@@ -0,0 +1,31 @@
FROM python:3.12-slim
WORKDIR /home/ooin/st/app
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    libssl-dev \
    libffi-dev \
    libxml2-dev \
    libxslt1-dev \
    zlib1g-dev \
    libjpeg-dev \
    libopenblas-dev \
    libopenmpi-dev \
    && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip install --upgrade pip
RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 flash-attn==2.7.2.post1 -f https://download.pytorch.org/whl/torch_stable.html
RUN pip install --no-cache-dir -r requirements.txt \
--index-url https://pypi.tuna.tsinghua.edu.cn/simple/ --timeout 100 --retries 5
COPY . .
EXPOSE 80
CMD ["uvicorn", "pipeline_UI3+audio:app", "--host", "0.0.0.0", "--port", "80"]

README.md

@@ -1,2 +1,2 @@
-# tiktok_AI
+# api

Binary file not shown.

Binary file not shown.

53
endpoints/image.py Normal file

@@ -0,0 +1,53 @@
from fastapi import UploadFile, Form
from fastapi.responses import JSONResponse
import asyncio
import io
from PIL import Image
from pipeline_setup import pipe, IMAGE_TOKEN
from utils.image_processing import encode_image_base64


async def image_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process an image with the user's query.
    """
    try:
        if file.content_type not in ["image/jpeg", "image/png"]:
            return JSONResponse({"query": question, "error": "Unsupported file type."})
        image_data = await file.read()
        image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
        encoded_image_base64 = encode_image_base64(image)
        question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
        # The pipeline is called with a (prompt, image) tuple; asyncio is required for to_thread
        response = await asyncio.to_thread(pipe, (question, image))
        return JSONResponse({"query": question, "response": response.text})
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})
# import mimetypes
# async def image_query(file: UploadFile, question: str = Form(...)):
# """
# API endpoint to process an image with the user's query.
# """
# try:
# # Get the file path from the UploadFile object
# file_path = file.filename
# # Determine the file type using the file extension
# file_type, _ = mimetypes.guess_type(file_path)
# if file_type not in ["image/jpeg", "image/png"]:
# return {"query": question, "error": "Unsupported file type."}
# # Read the image file
# image_data = await file.read()
# image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
# encoded_image_base64 = encode_image_base64(image)
# # Prepare the query with the image token
# question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
# # Query the model
# response = await asyncio.to_thread(pipe, (question, image))
# return {"query": question, "response": response.text}
# except Exception as e:
# return {"query": question, "error": str(e)}

24
endpoints/text.py Normal file

@@ -0,0 +1,24 @@
from fastapi import Form
from fastapi.responses import JSONResponse
from asyncio import to_thread
from pipeline_setup import pipe


async def text_query(question: str = Form(...)):
    """
    API endpoint to process text input with the user's query.
    """
    try:
        response = await to_thread(pipe, question)
        return JSONResponse({"query": question, "response": response.text})
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})
# async def text_query(question: str = Form(...)):
# """
# API endpoint to process text input with the user's query.
# """
# try:
# response = await to_thread(pipe, question)
# return {"query": question, "response": response.text}
# except Exception as e:
# return {"query": question, "error": str(e)}

229
endpoints/video.py Normal file

@@ -0,0 +1,229 @@
from fastapi import UploadFile, Form
from fastapi.responses import JSONResponse
from pipeline_setup import pipe
from utils.image_processing import encode_image_base64
from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video
from utils.audio_transcription import transcribe_audio
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor
async def video_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process a video file with the user's query.
    """
    try:
        print("Processing video...")

        # Validate file type
        if file.content_type not in ["video/mp4", "video/avi", "video/mkv"]:
            return JSONResponse({"query": question, "error": "Unsupported video file type."})

        # Start overall timer
        overall_start_time = time.time()

        # Save the uploaded video to a temporary file
        print("Reading video...")
        video_data = await file.read()
        temp_video_path = "/tmp/temp_video.mp4"
        with open(temp_video_path, "wb") as temp_video_file:
            temp_video_file.write(video_data)
        print(f"Temp video saved to: {temp_video_path}")

        # Record the time after reading the video
        video_reading_time = time.time()

        # Split the video into segments
        print("Splitting video...")
        segments = split_video_into_segments(temp_video_path, segment_duration=30)
        print(f"Video split into {len(segments)} segments.")

        aggregated_responses = []
        segment_timings = []

        for i, segment_path in enumerate(segments):
            print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")

            # Start timing for the segment
            segment_start_time = time.time()

            # Extract key frames
            frame_start_time = time.time()
            imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
            frame_time = time.time()

            # Extract audio and transcribe
            audio_start_time = time.time()
            audio_path = extract_audio_from_video(segment_path)
            transcribed_text = transcribe_audio(audio_path)
            audio_time = time.time()

            # Combine transcribed text with the query
            combined_query = f"Audio Transcript: {transcribed_text}\n{question}"

            # Prepare content for the pipeline
            question_with_frames = ""
            for j, img in enumerate(imgs):
                question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
            question_with_frames += combined_query

            content = [{"type": "text", "text": question_with_frames}]
            for img in imgs:
                content.append({
                    "type": "image_url",
                    "image_url": {
                        "max_dynamic_patch": 1,
                        "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
                    }
                })

            # Query the model
            inference_start_time = time.time()
            messages = [dict(role="user", content=content)]
            response = await asyncio.to_thread(pipe, messages)
            inference_time = time.time()

            # Aggregate response
            aggregated_responses.append(response.text)

            # Calculate timing for the segment
            segment_timings.append({
                "segment_index": i + 1,
                "segment_processing_time": inference_time - segment_start_time,
                "frame_extraction_time": frame_time - frame_start_time,
                "audio_extraction_time": audio_time - audio_start_time,
                "model_inference_time": inference_time - inference_start_time
            })

            print(f"transcription: {transcribed_text}")
            # print(f"content: {content}")

        overall_end_time = time.time()

        # Aggregate total timings
        total_timings = {
            "video_reading_time": video_reading_time - overall_start_time,
            "total_segments": len(segments),
            "total_processing_time": overall_end_time - overall_start_time,
            "segment_details": segment_timings
        }

        return JSONResponse({
            "question": question,
            "responses": aggregated_responses,
            "timings": total_timings,
        })
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})
# async def video_query(file: UploadFile, question: str = Form(...)):
# """
# API endpoint to process a video file with the user's query.
# """
# try:
# print("Processing video...")
# # Get the file path from the UploadFile object
# file_path = file.filename
# # Determine the file type using the file extension
# file_type, _ = mimetypes.guess_type(file_path)
# if file_type is None or not file_type.startswith("video/"):
# return {"query": question, "error": "Unsupported video file type."}
# # Start overall timer
# overall_start_time = time.time()
# # Save the uploaded video to a temporary file
# print("Reading video...")
# video_data = await file.read()
# temp_video_path = "/tmp/temp_video.mp4"
# with open(temp_video_path, "wb") as temp_video_file:
# temp_video_file.write(video_data)
# print(f"Temp video saved to: {temp_video_path}")
# # Record the time after reading the video
# video_reading_time = time.time()
# # Split the video into segments
# print("Splitting video...")
# segments = split_video_into_segments(temp_video_path, segment_duration=30)
# print(f"Video split into {len(segments)} segments.")
# aggregated_responses = []
# segment_timings = []
# for i, segment_path in enumerate(segments):
# print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")
# # Start timing for the segment
# segment_start_time = time.time()
# # Extract key frames
# frame_start_time = time.time()
# imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
# frame_time = time.time()
# # Extract audio and transcribe
# audio_start_time = time.time()
# audio_path = extract_audio_from_video(segment_path)
# transcribed_text = transcribe_audio(audio_path)
# audio_time = time.time()
# # Combine transcribed text with the query
# combined_query = f"Audio Transcript: {transcribed_text}\n{question}"
# # Prepare content for the pipeline
# question_with_frames = ""
# for j, img in enumerate(imgs):
# question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
# question_with_frames += combined_query
# content = [{"type": "text", "text": question_with_frames}]
# for img in imgs:
# content.append({
# "type": "image_url",
# "image_url": {
# "max_dynamic_patch": 1,
# "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
# }
# })
# # Query the model
# inference_start_time = time.time()
# messages = [dict(role="user", content=content)]
# response = await asyncio.to_thread(pipe, messages)
# inference_time = time.time()
# # Aggregate response
# aggregated_responses.append(response.text)
# # Calculate timing for the segment
# segment_timings.append({
# "segment_index": i + 1,
# "segment_processing_time": inference_time - segment_start_time,
# "frame_extraction_time": frame_time - frame_start_time,
# "audio_extraction_time": audio_time - audio_start_time,
# "model_inference_time": inference_time - inference_start_time
# })
# print(f"transcription: {transcribed_text}")
# overall_end_time = time.time()
# # Aggregate total timings
# total_timings = {
# "video_reading_time": video_reading_time - overall_start_time,
# "total_segments": len(segments),
# "total_processing_time": overall_end_time - overall_start_time,
# "segment_details": segment_timings
# }
# return {
# "question": question,
# "responses": aggregated_responses,
# "timings": total_timings,
# }
# except Exception as e:
# return {"query": question, "error": str(e)}

18
main.py Normal file

@@ -0,0 +1,18 @@
from fastapi import FastAPI
from endpoints.text import text_query
from endpoints.image import image_query
from endpoints.video import video_query

app = FastAPI()

# Register routes
app.post("/api/text")(text_query)
app.post("/api/image")(image_query)
app.post("/api/video")(video_query)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8080, reload=True)

# python main.py
# uvicorn main:app --reload
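
For reference, a minimal client sketch (not part of this commit) of how the routes registered above could be exercised with the requests library; the base URL and example.jpg are assumptions for illustration only.

import requests

BASE = "http://localhost:8080"  # assumed: local dev server started via `python main.py`

# Text query: plain form field, matching text_query(question: str = Form(...))
r = requests.post(f"{BASE}/api/text", data={"question": "What is InternVL?"})
print(r.json())

# Image query: multipart upload plus the form field expected by image_query
with open("example.jpg", "rb") as f:  # hypothetical local file
    r = requests.post(
        f"{BASE}/api/image",
        files={"file": ("example.jpg", f, "image/jpeg")},
        data={"question": "Describe this image."},
    )
print(r.json())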

19
pipeline_setup.py Normal file

@@ -0,0 +1,19 @@
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig

# Constants
IMAGE_TOKEN = "[IMAGE_TOKEN]"

# Model initialization
model = "OpenGVLab/InternVL2-26B-AWQ"
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(
        model_format="awq",
        tp=4,
        session_len=12864,
        max_batch_size=1,
        cache_max_entry_count=0.05,
        cache_block_seq_len=32768,
        quant_policy=4
    )
)
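
As a rough illustration (not part of this commit), the shared pipe object can be called directly with either a plain prompt or a (prompt, image) tuple, which is how endpoints/text.py and endpoints/image.py use it; example.jpg is a hypothetical file and the snippet assumes the AWQ weights are available.

from PIL import Image
from pipeline_setup import pipe

# Plain text prompt, as in endpoints/text.py
print(pipe("Summarize optical flow in one sentence.").text)

# (prompt, image) tuple, as in endpoints/image.py
img = Image.open("example.jpg").convert("RGB")  # hypothetical input image
print(pipe(("What is shown in this image?", img)).text)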

117
requirements.txt Normal file

@@ -0,0 +1,117 @@
accelerate==1.2.1
addict==2.4.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.7.0
attrs==24.3.0
bitsandbytes==0.45.0
certifi==2024.12.14
charset-normalizer==3.4.0
click==8.1.8
cloudpickle==3.1.0
datasets==3.2.0
decord==0.6.0
dill==0.3.8
diskcache==5.6.3
distro==1.9.0
einops==0.8.0
fastapi==0.115.6
filelock==3.16.1
fire==0.7.0
# flash-attn==2.7.2.post1
frozenlist==1.5.0
fsspec==2024.9.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
huggingface-hub==0.27.0
idna==3.10
interegular==0.3.3
Jinja2==3.1.5
jiter==0.8.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
lark==1.2.2
llvmlite==0.43.0
lmdeploy==0.6.4
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
mmengine-lite==0.10.5
modelscope==1.21.0
mpmath==1.3.0
multidict==6.1.0
multiprocess==0.70.16
nest-asyncio==1.6.0
networkx==3.4.2
ninja==1.11.1.3
numba==0.60.0
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-ml-py==12.560.30
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.1.105
openai==1.58.1
opencv-python==4.10.0.84
outlines==0.0.46
packaging==24.2
pandas==2.2.3
peft==0.11.1
pillow==11.0.0
platformdirs==4.3.6
propcache==0.2.1
protobuf==5.29.2
psutil==6.1.1
pyairports==2.1.1
pyarrow==18.1.0
pycountry==24.6.1
pydantic==2.10.4
pydantic_core==2.27.2
Pygments==2.18.0
pynvml==12.0.0
python-dateutil==2.9.0.post0
python-multipart==0.0.20
pytz==2024.2
PyYAML==6.0.2
referencing==0.35.1
regex==2024.11.6
requests==2.32.3
rich==13.9.4
rpds-py==0.22.3
safetensors==0.4.5
sentencepiece==0.2.0
setuptools==75.6.0
shortuuid==1.0.13
six==1.17.0
sniffio==1.3.1
starlette==0.41.3
sympy==1.13.3
termcolor==2.5.0
tiktoken==0.8.0
timm==1.0.12
tokenizers==0.21.0
# torch==2.4.0
# torchaudio==2.4.0
# torchvision==0.19.0
tqdm==4.67.1
transformers==4.47.1
triton==3.0.0
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.3.0
uvicorn==0.34.0
wheel==0.45.1
xxhash==3.5.0
yapf==0.43.0
yarl==1.18.3

63
ui.py Normal file

@@ -0,0 +1,63 @@
import gradio as gr
import asyncio
from endpoints.text import text_query
from endpoints.image import image_query
from endpoints.video import video_query


def setup_ui():
    with gr.Blocks() as ui:
        gr.Markdown(
            """
            # Multimodal Query Interface
            Submit text, image, or video queries and get insights powered by APIs.
            """
        )
        # Tabbed layout
        with gr.Tabs():
            # Text Query Tab
            with gr.Tab("Text Query"):
                gr.Markdown("### Submit a Text Query")
                with gr.Row():
                    text_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    text_button = gr.Button("Submit")
                text_output = gr.Textbox(label="Response", interactive=False)
                text_button.click(
                    fn=lambda q: asyncio.run(text_query(q)),
                    inputs=[text_input],
                    outputs=[text_output]
                )
            # Image Query Tab
            with gr.Tab("Image Query"):
                gr.Markdown("### Submit an Image Query")
                with gr.Row():
                    image_input = gr.File(label="Upload Image")
                    image_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    image_button = gr.Button("Submit")
                image_output = gr.Textbox(label="Response", interactive=False)
                image_button.click(
                    fn=lambda img, q: asyncio.run(image_query(img, q)),
                    inputs=[image_input, image_question_input],
                    outputs=[image_output]
                )
            # Video Query Tab
            with gr.Tab("Video Query"):
                gr.Markdown("### Submit a Video Query")
                with gr.Row():
                    video_input = gr.File(label="Upload Video")
                    video_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    video_button = gr.Button("Submit")
                video_output = gr.Textbox(label="Response", interactive=False)
                video_button.click(
                    fn=lambda vid, q: asyncio.run(video_query(vid, q)),
                    inputs=[video_input, video_question_input],
                    outputs=[video_output]
                )
    return ui


if __name__ == "__main__":
    ui = setup_ui()
    ui.launch(server_name="0.0.0.0", server_port=7860)

Binary file not shown.

Binary file not shown.

Binary file not shown.

13
utils/audio_transcription.py Normal file

@@ -0,0 +1,13 @@
from pydub import AudioSegment
from whisper import load_model


def extract_audio_from_video(video_path: str) -> str:
    audio = AudioSegment.from_file(video_path)
    audio_path = "/tmp/temp_audio.wav"
    audio.export(audio_path, format="wav")
    return audio_path


def transcribe_audio(audio_path: str) -> str:
    model = load_model("base")
    result = model.transcribe(audio_path)
    return result["text"]
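
A quick sketch (not part of this commit) of using these helpers on a standalone clip; example_clip.mp4 is a hypothetical file, and the snippet assumes ffmpeg is installed for pydub and the Whisper "base" weights can be downloaded.

from utils.audio_transcription import extract_audio_from_video, transcribe_audio

wav_path = extract_audio_from_video("example_clip.mp4")  # hypothetical input clip
print(transcribe_audio(wav_path))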

12
utils/image_processing.py Normal file

@@ -0,0 +1,12 @@
import io
import base64
from PIL import Image


def encode_image_base64(image: Image.Image) -> str:
    """
    Encode a PIL Image to a Base64 string.
    """
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()
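
For illustration (not part of this commit), this helper is what endpoints/video.py uses to build data: URLs for each key frame; a minimal sketch assuming a hypothetical example.jpg.

from PIL import Image
from utils.image_processing import encode_image_base64

img = Image.open("example.jpg").convert("RGB")  # hypothetical input image
data_url = f"data:image/jpeg;base64,{encode_image_base64(img)}"
print(data_url[:64], "...")  # print only a prefix of the encoded payload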

93
utils/video_processing.py Normal file

@@ -0,0 +1,93 @@
import cv2
import os
import subprocess
import numpy as np
from PIL import Image
from pydub import AudioSegment
from decord import VideoReader, cpu
from concurrent.futures import ThreadPoolExecutor


def split_video_into_segments(video_path, segment_duration=30):
    """
    Splits a video into segments of a specified duration using FFmpeg.
    """
    output_dir = "/tmp/video_segments"
    os.makedirs(output_dir, exist_ok=True)

    # Calculate total duration of the video
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    total_duration = total_frames / fps
    cap.release()

    segments = []
    for start_time in range(0, int(total_duration), segment_duration):
        segment_file = os.path.join(output_dir, f"segment_{start_time}.mp4")
        command = [
            "ffmpeg", "-i", video_path,
            "-ss", str(start_time),
            "-t", str(segment_duration),
            "-c", "copy", segment_file
        ]
        subprocess.run(command, check=True)
        segments.append(segment_file)
    return segments


def extract_motion_key_frames(video_path, max_frames=20, sigma_multiplier=2, frame_interval=1):
    """
    Extracts key frames from a video based on motion intensity.
    """
    def calculate_motion(frame_pair):
        """
        Calculates motion between two consecutive frames using optical flow.
        """
        prev_gray, current_frame = frame_pair
        current_gray = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
        flow = cv2.calcOpticalFlowFarneback(prev_gray, current_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
        motion = np.sum(flow ** 2)
        return motion, current_gray

    # Load video frames using Decord
    video = VideoReader(video_path, ctx=cpu(0))
    frames_batch = video.get_batch(range(0, len(video), frame_interval)).asnumpy()

    # Resize frames for faster processing
    frames = [cv2.resize(frame, (frame.shape[1] // 2, frame.shape[0] // 2)) for frame in frames_batch]

    # Initialize the first frame
    prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)
    frame_pairs = [(prev_gray, frames[i]) for i in range(1, len(frames))]

    # Calculate motion statistics
    motion_values = []
    with ThreadPoolExecutor() as executor:
        motion_results = list(executor.map(calculate_motion, frame_pairs))
    motion_values = [motion for motion, _ in motion_results]

    # Calculate threshold statistically
    motion_mean = np.mean(motion_values)
    motion_std = np.std(motion_values)
    threshold = motion_mean + sigma_multiplier * motion_std

    # Extract key frames based on motion threshold
    key_frames = []
    for i, (motion, frame) in enumerate(zip(motion_values, frames[1:])):
        if motion > threshold and len(key_frames) < max_frames:
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            key_frames.append(img)
    return key_frames


def extract_audio_from_video(video_path):
    """
    Extract audio from video using pydub and save as a temporary audio file.
    """
    print("Audio extraction started...")
    audio = AudioSegment.from_file(video_path)
    print("Audio extraction completed.")
    audio_path = "/tmp/temp_audio.wav"
    audio.export(audio_path, format="wav")
    print(f"Audio extracted and saved to: {audio_path}")
    return audio_path
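
A rough end-to-end sketch (not part of this commit) of the segmentation and key-frame path used by endpoints/video.py; example_clip.mp4 is a hypothetical file and ffmpeg is assumed to be on PATH.

from utils.video_processing import split_video_into_segments, extract_motion_key_frames

segments = split_video_into_segments("example_clip.mp4", segment_duration=30)
for seg in segments:
    frames = extract_motion_key_frames(seg, max_frames=10, sigma_multiplier=2)
    print(seg, "->", len(frames), "PIL key frames")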