updated ui
This commit is contained in:
parent 0eb6bdc9c4
commit 31a7abea14
.gitignore (vendored, 2 additions)
@@ -11,3 +11,5 @@
 *.app
 .snapshots/*

+__pycache__/*
+endpoints/__pycache__/*
Dockerfile (new file, 31 lines)
@@ -0,0 +1,31 @@
FROM python:3.12-slim

WORKDIR /home/ooin/st/app

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    libssl-dev \
    libffi-dev \
    libxml2-dev \
    libxslt1-dev \
    zlib1g-dev \
    libjpeg-dev \
    libopenblas-dev \
    libopenmpi-dev \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .

RUN pip install --upgrade pip

RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 flash-attn==2.7.2.post1 -f https://download.pytorch.org/whl/torch_stable.html

RUN pip install --no-cache-dir -r requirements.txt \
    --index-url https://pypi.tuna.tsinghua.edu.cn/simple/ --timeout 100 --retries 5

COPY . .

EXPOSE 80

CMD ["uvicorn", "pipeline_UI3+audio:app", "--host", "0.0.0.0", "--port", "80"]
__pycache__/main.cpython-312.pyc (new file, BIN; binary file not shown)
__pycache__/pipeline_setup.cpython-312.pyc (new file, BIN; binary file not shown)
endpoints/image.py (new file, 53 lines)
@@ -0,0 +1,53 @@
from fastapi import UploadFile, Form
from fastapi.responses import JSONResponse
import io
import asyncio  # needed for asyncio.to_thread below
from PIL import Image
from pipeline_setup import pipe, IMAGE_TOKEN
from utils.image_processing import encode_image_base64


async def image_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process an image with the user's query.
    """
    try:
        if file.content_type not in ["image/jpeg", "image/png"]:
            return JSONResponse({"query": question, "error": "Unsupported file type."})

        image_data = await file.read()
        image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
        encoded_image_base64 = encode_image_base64(image)  # note: currently unused

        question_with_image_token = f"{question}\n{IMAGE_TOKEN}"  # note: currently not passed to the pipeline
        response = await asyncio.to_thread(pipe, (question, image))
        return JSONResponse({"query": question, "response": response.text})
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})


# import mimetypes
# async def image_query(file: UploadFile, question: str = Form(...)):
#     """
#     API endpoint to process an image with the user's query.
#     """
#     try:
#         # Get the file path from the UploadFile object
#         file_path = file.filename
#
#         # Determine the file type using the file extension
#         file_type, _ = mimetypes.guess_type(file_path)
#         if file_type not in ["image/jpeg", "image/png"]:
#             return {"query": question, "error": "Unsupported file type."}
#
#         # Read the image file
#         image_data = await file.read()
#         image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
#         encoded_image_base64 = encode_image_base64(image)
#
#         # Prepare the query with the image token
#         question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
#
#         # Query the model
#         response = await asyncio.to_thread(pipe, (question, image))
#         return {"query": question, "response": response.text}
#     except Exception as e:
#         return {"query": question, "error": str(e)}
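A quick way to exercise this endpoint once the server is running is a multipart POST. The sketch below is an assumption-laden smoke test, not part of the commit: it assumes the app from main.py is listening on localhost:8080 and that a local test.jpg exists.

# Hypothetical smoke test for /api/image; host, port, and test.jpg are assumptions.
import requests

with open("test.jpg", "rb") as f:
    resp = requests.post(
        "http://localhost:8080/api/image",
        files={"file": ("test.jpg", f, "image/jpeg")},  # content type must be image/jpeg or image/png
        data={"question": "What is in this picture?"},
    )
print(resp.json())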
endpoints/text.py (new file, 24 lines)
@@ -0,0 +1,24 @@
from fastapi import Form
from fastapi.responses import JSONResponse
from asyncio import to_thread
from pipeline_setup import pipe


async def text_query(question: str = Form(...)):
    """
    API endpoint to process text input with the user's query.
    """
    try:
        response = await to_thread(pipe, question)
        return JSONResponse({"query": question, "response": response.text})
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})


# async def text_query(question: str = Form(...)):
#     """
#     API endpoint to process text input with the user's query.
#     """
#     try:
#         response = await to_thread(pipe, question)
#         return {"query": question, "response": response.text}
#     except Exception as e:
#         return {"query": question, "error": str(e)}
endpoints/video.py (new file, 229 lines)
@@ -0,0 +1,229 @@
from fastapi import UploadFile, Form
from fastapi.responses import JSONResponse
from pipeline_setup import pipe
from utils.image_processing import encode_image_base64
from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video
from utils.audio_transcription import transcribe_audio
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor  # note: currently unused in this module


async def video_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process a video file with the user's query.
    """
    try:
        print("Processing video...")

        # Validate file type
        if file.content_type not in ["video/mp4", "video/avi", "video/mkv"]:
            return JSONResponse({"query": question, "error": "Unsupported video file type."})

        # Start overall timer
        overall_start_time = time.time()

        # Save the uploaded video to a temporary file
        print("Reading video...")
        video_data = await file.read()
        temp_video_path = "/tmp/temp_video.mp4"
        with open(temp_video_path, "wb") as temp_video_file:
            temp_video_file.write(video_data)
        print(f"Temp video saved to: {temp_video_path}")

        # Record the time after reading the video
        video_reading_time = time.time()

        # Split the video into segments
        print("Splitting video...")
        segments = split_video_into_segments(temp_video_path, segment_duration=30)
        print(f"Video split into {len(segments)} segments.")

        aggregated_responses = []
        segment_timings = []

        for i, segment_path in enumerate(segments):
            print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")

            # Start timing for the segment
            segment_start_time = time.time()

            # Extract key frames
            frame_start_time = time.time()
            imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
            frame_time = time.time()

            # Extract audio and transcribe
            audio_start_time = time.time()
            audio_path = extract_audio_from_video(segment_path)
            transcribed_text = transcribe_audio(audio_path)
            audio_time = time.time()

            # Combine transcribed text with the query
            combined_query = f"Audio Transcript: {transcribed_text}\n{question}"

            # Prepare content for the pipeline
            question_with_frames = ""
            for j, img in enumerate(imgs):
                # double braces render the literal text "{IMAGE_TOKEN}" in the prompt
                question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
            question_with_frames += combined_query

            content = [{"type": "text", "text": question_with_frames}]
            for img in imgs:
                content.append({
                    "type": "image_url",
                    "image_url": {
                        "max_dynamic_patch": 1,
                        "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
                    }
                })

            # Query the model
            inference_start_time = time.time()
            messages = [dict(role="user", content=content)]
            response = await asyncio.to_thread(pipe, messages)
            inference_time = time.time()

            # Aggregate response
            aggregated_responses.append(response.text)

            # Calculate timing for the segment
            segment_timings.append({
                "segment_index": i + 1,
                "segment_processing_time": inference_time - segment_start_time,
                "frame_extraction_time": frame_time - frame_start_time,
                "audio_extraction_time": audio_time - audio_start_time,
                "model_inference_time": inference_time - inference_start_time
            })

            print(f"transcription: {transcribed_text}")
            # print(f"content: {content}")

        overall_end_time = time.time()

        # Aggregate total timings
        total_timings = {
            "video_reading_time": video_reading_time - overall_start_time,
            "total_segments": len(segments),
            "total_processing_time": overall_end_time - overall_start_time,
            "segment_details": segment_timings
        }

        return JSONResponse({
            "question": question,
            "responses": aggregated_responses,
            "timings": total_timings,
        })
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})


# async def video_query(file: UploadFile, question: str = Form(...)):
#     """
#     API endpoint to process a video file with the user's query.
#     """
#     try:
#         print("Processing video...")
#
#         # Get the file path from the UploadFile object
#         file_path = file.filename
#
#         # Determine the file type using the file extension
#         file_type, _ = mimetypes.guess_type(file_path)
#         if file_type is None or not file_type.startswith("video/"):
#             return {"query": question, "error": "Unsupported video file type."}
#
#         # Start overall timer
#         overall_start_time = time.time()
#
#         # Save the uploaded video to a temporary file
#         print("Reading video...")
#         video_data = await file.read()
#         temp_video_path = "/tmp/temp_video.mp4"
#         with open(temp_video_path, "wb") as temp_video_file:
#             temp_video_file.write(video_data)
#         print(f"Temp video saved to: {temp_video_path}")
#
#         # Record the time after reading the video
#         video_reading_time = time.time()
#
#         # Split the video into segments
#         print("Splitting video...")
#         segments = split_video_into_segments(temp_video_path, segment_duration=30)
#         print(f"Video split into {len(segments)} segments.")
#
#         aggregated_responses = []
#         segment_timings = []
#
#         for i, segment_path in enumerate(segments):
#             print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")
#
#             # Start timing for the segment
#             segment_start_time = time.time()
#
#             # Extract key frames
#             frame_start_time = time.time()
#             imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
#             frame_time = time.time()
#
#             # Extract audio and transcribe
#             audio_start_time = time.time()
#             audio_path = extract_audio_from_video(segment_path)
#             transcribed_text = transcribe_audio(audio_path)
#             audio_time = time.time()
#
#             # Combine transcribed text with the query
#             combined_query = f"Audio Transcript: {transcribed_text}\n{question}"
#
#             # Prepare content for the pipeline
#             question_with_frames = ""
#             for j, img in enumerate(imgs):
#                 question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
#             question_with_frames += combined_query
#
#             content = [{"type": "text", "text": question_with_frames}]
#             for img in imgs:
#                 content.append({
#                     "type": "image_url",
#                     "image_url": {
#                         "max_dynamic_patch": 1,
#                         "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
#                     }
#                 })
#
#             # Query the model
#             inference_start_time = time.time()
#             messages = [dict(role="user", content=content)]
#             response = await asyncio.to_thread(pipe, messages)
#             inference_time = time.time()
#
#             # Aggregate response
#             aggregated_responses.append(response.text)
#
#             # Calculate timing for the segment
#             segment_timings.append({
#                 "segment_index": i + 1,
#                 "segment_processing_time": inference_time - segment_start_time,
#                 "frame_extraction_time": frame_time - frame_start_time,
#                 "audio_extraction_time": audio_time - audio_start_time,
#                 "model_inference_time": inference_time - inference_start_time
#             })
#
#             print(f"transcription: {transcribed_text}")
#
#         overall_end_time = time.time()
#
#         # Aggregate total timings
#         total_timings = {
#             "video_reading_time": video_reading_time - overall_start_time,
#             "total_segments": len(segments),
#             "total_processing_time": overall_end_time - overall_start_time,
#             "segment_details": segment_timings
#         }
#
#         return {
#             "question": question,
#             "responses": aggregated_responses,
#             "timings": total_timings,
#         }
#     except Exception as e:
#         return {"query": question, "error": str(e)}
main.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from fastapi import FastAPI
from endpoints.text import text_query
from endpoints.image import image_query
from endpoints.video import video_query

app = FastAPI()

# Register routes
app.post("/api/text")(text_query)
app.post("/api/image")(image_query)
app.post("/api/video")(video_query)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8080, reload=True)

# python main.py
# uvicorn main:app --reload
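For reference, a minimal client call against the registered routes. This is a sketch, not part of the commit: it assumes the server was started as above (uvicorn on port 8080) and uses the requests package already pinned in requirements.txt.

# Minimal sketch of calling the text route; localhost:8080 matches the uvicorn call above.
import requests

resp = requests.post(
    "http://localhost:8080/api/text",
    data={"question": "Summarize the project in one sentence."},
)
print(resp.json())  # expected shape: {"query": ..., "response": ...} or {"query": ..., "error": ...}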
pipeline_setup.py (new file, 19 lines)
@@ -0,0 +1,19 @@
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig

# Constants
IMAGE_TOKEN = "[IMAGE_TOKEN]"

# Model initialization
model = "OpenGVLab/InternVL2-26B-AWQ"
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(
        model_format="awq",
        tp=4,
        session_len=12864,
        max_batch_size=1,
        cache_max_entry_count=0.05,
        cache_block_seq_len=32768,
        quant_policy=4
    )
)
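For orientation, the endpoints call this pipe object in three shapes: a bare string (endpoints/text.py), a (question, image) tuple (endpoints/image.py), and a messages list (endpoints/video.py). A minimal sketch mirroring those call patterns, assuming the model above has loaded and that a local test.jpg exists (both assumptions, not part of the commit):

# Sketch only; mirrors the call patterns used in endpoints/, not a new API.
from PIL import Image
from pipeline_setup import pipe

text_out = pipe("Describe the project in one sentence.")           # plain text prompt
image_out = pipe(("What is shown here?", Image.open("test.jpg")))  # (question, image) tuple
msg_out = pipe([dict(role="user", content=[{"type": "text", "text": "Hello"}])])  # messages list
print(text_out.text, image_out.text, msg_out.text)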
requirements.txt (new file, 117 lines)
@@ -0,0 +1,117 @@
accelerate==1.2.1
addict==2.4.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.7.0
attrs==24.3.0
bitsandbytes==0.45.0
certifi==2024.12.14
charset-normalizer==3.4.0
click==8.1.8
cloudpickle==3.1.0
datasets==3.2.0
decord==0.6.0
dill==0.3.8
diskcache==5.6.3
distro==1.9.0
einops==0.8.0
fastapi==0.115.6
filelock==3.16.1
fire==0.7.0
# flash-attn==2.7.2.post1
frozenlist==1.5.0
fsspec==2024.9.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
huggingface-hub==0.27.0
idna==3.10
interegular==0.3.3
Jinja2==3.1.5
jiter==0.8.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
lark==1.2.2
llvmlite==0.43.0
lmdeploy==0.6.4
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
mmengine-lite==0.10.5
modelscope==1.21.0
mpmath==1.3.0
multidict==6.1.0
multiprocess==0.70.16
nest-asyncio==1.6.0
networkx==3.4.2
ninja==1.11.1.3
numba==0.60.0
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-ml-py==12.560.30
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.1.105
openai==1.58.1
opencv-python==4.10.0.84
outlines==0.0.46
packaging==24.2
pandas==2.2.3
peft==0.11.1
pillow==11.0.0
platformdirs==4.3.6
propcache==0.2.1
protobuf==5.29.2
psutil==6.1.1
pyairports==2.1.1
pyarrow==18.1.0
pycountry==24.6.1
pydantic==2.10.4
pydantic_core==2.27.2
Pygments==2.18.0
pynvml==12.0.0
python-dateutil==2.9.0.post0
python-multipart==0.0.20
pytz==2024.2
PyYAML==6.0.2
referencing==0.35.1
regex==2024.11.6
requests==2.32.3
rich==13.9.4
rpds-py==0.22.3
safetensors==0.4.5
sentencepiece==0.2.0
setuptools==75.6.0
shortuuid==1.0.13
six==1.17.0
sniffio==1.3.1
starlette==0.41.3
sympy==1.13.3
termcolor==2.5.0
tiktoken==0.8.0
timm==1.0.12
tokenizers==0.21.0
# torch==2.4.0
# torchaudio==2.4.0
# torchvision==0.19.0
tqdm==4.67.1
transformers==4.47.1
triton==3.0.0
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.3.0
uvicorn==0.34.0
wheel==0.45.1
xxhash==3.5.0
yapf==0.43.0
yarl==1.18.3
ui.py (new file, 63 lines)
@@ -0,0 +1,63 @@
import gradio as gr
import asyncio
from endpoints.text import text_query
from endpoints.image import image_query
from endpoints.video import video_query


def setup_ui():
    with gr.Blocks() as ui:
        gr.Markdown(
            """
            # Multimodal Query Interface
            Submit text, image, or video queries and get insights powered by APIs.
            """
        )

        # Tabbed layout
        with gr.Tabs():
            # Text Query Tab
            with gr.Tab("Text Query"):
                gr.Markdown("### Submit a Text Query")
                with gr.Row():
                    text_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    text_button = gr.Button("Submit")
                text_output = gr.Textbox(label="Response", interactive=False)
                text_button.click(
                    fn=lambda q: asyncio.run(text_query(q)),
                    inputs=[text_input],
                    outputs=[text_output]
                )

            # Image Query Tab
            with gr.Tab("Image Query"):
                gr.Markdown("### Submit an Image Query")
                with gr.Row():
                    image_input = gr.File(label="Upload Image")
                    image_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    image_button = gr.Button("Submit")
                image_output = gr.Textbox(label="Response", interactive=False)
                image_button.click(
                    fn=lambda img, q: asyncio.run(image_query(img, q)),
                    inputs=[image_input, image_question_input],
                    outputs=[image_output]
                )

            # Video Query Tab
            with gr.Tab("Video Query"):
                gr.Markdown("### Submit a Video Query")
                with gr.Row():
                    video_input = gr.File(label="Upload Video")
                    video_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    video_button = gr.Button("Submit")
                video_output = gr.Textbox(label="Response", interactive=False)
                video_button.click(
                    fn=lambda vid, q: asyncio.run(video_query(vid, q)),
                    inputs=[video_input, video_question_input],
                    outputs=[video_output]
                )

    return ui


if __name__ == "__main__":
    ui = setup_ui()
    ui.launch(server_name="0.0.0.0", server_port=7860)
utils/__pycache__/audio_transcription.cpython-312.pyc (new file, BIN; binary file not shown)
utils/__pycache__/image_processing.cpython-312.pyc (new file, BIN; binary file not shown)
utils/__pycache__/video_processing.cpython-312.pyc (new file, BIN; binary file not shown)
utils/audio_transcription.py (new file, 13 lines)
@@ -0,0 +1,13 @@
from pydub import AudioSegment
from whisper import load_model


def extract_audio_from_video(video_path: str) -> str:
    audio = AudioSegment.from_file(video_path)
    audio_path = "/tmp/temp_audio.wav"
    audio.export(audio_path, format="wav")
    return audio_path


def transcribe_audio(audio_path: str) -> str:
    model = load_model("base")  # note: the Whisper "base" model is reloaded on every call
    result = model.transcribe(audio_path)
    return result["text"]
utils/image_processing.py (new file, 12 lines)
@@ -0,0 +1,12 @@
import io
import base64
from PIL import Image


def encode_image_base64(image: Image.Image) -> str:
    """
    Encode a PIL Image to a Base64 string.
    """
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()
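A small round-trip check for the helper above (a sketch for illustration only; the endpoints themselves never decode):

# Round-trip sketch: encode a PIL image to base64 and decode it back.
import base64, io
from PIL import Image
from utils.image_processing import encode_image_base64

img = Image.new("RGB", (64, 64), "white")  # throwaway test image
b64 = encode_image_base64(img)
restored = Image.open(io.BytesIO(base64.b64decode(b64)))
assert restored.size == (64, 64)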
utils/video_processing.py (new file, 93 lines)
@@ -0,0 +1,93 @@
import cv2
import os
import subprocess
import numpy as np
from PIL import Image
from pydub import AudioSegment
from decord import VideoReader, cpu
from concurrent.futures import ThreadPoolExecutor


def split_video_into_segments(video_path, segment_duration=30):
    """
    Splits a video into segments of a specified duration using FFmpeg.
    """
    output_dir = "/tmp/video_segments"
    os.makedirs(output_dir, exist_ok=True)

    # Calculate total duration of the video
    cap = cv2.VideoCapture(video_path)
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    total_duration = total_frames / fps
    cap.release()

    segments = []
    for start_time in range(0, int(total_duration), segment_duration):
        segment_file = os.path.join(output_dir, f"segment_{start_time}.mp4")
        command = [
            "ffmpeg", "-i", video_path,
            "-ss", str(start_time),
            "-t", str(segment_duration),
            "-c", "copy", segment_file
        ]
        subprocess.run(command, check=True)
        segments.append(segment_file)

    return segments


def extract_motion_key_frames(video_path, max_frames=20, sigma_multiplier=2, frame_interval=1):
    """
    Extracts key frames from a video based on motion intensity.
    """
    def calculate_motion(frame_pair):
        """
        Calculates motion between two consecutive frames using optical flow.
        """
        prev_gray, current_frame = frame_pair
        current_gray = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
        flow = cv2.calcOpticalFlowFarneback(prev_gray, current_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
        motion = np.sum(flow ** 2)
        return motion, current_gray

    # Load video frames using Decord
    video = VideoReader(video_path, ctx=cpu(0))
    frames_batch = video.get_batch(range(0, len(video), frame_interval)).asnumpy()

    # Resize frames for faster processing
    frames = [cv2.resize(frame, (frame.shape[1] // 2, frame.shape[0] // 2)) for frame in frames_batch]

    # Initialize the first frame
    prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)
    # note: every pair reuses the first frame as reference, so motion is measured
    # against frame 0 rather than the immediately preceding frame
    frame_pairs = [(prev_gray, frames[i]) for i in range(1, len(frames))]

    # Calculate motion statistics
    motion_values = []
    with ThreadPoolExecutor() as executor:
        motion_results = list(executor.map(calculate_motion, frame_pairs))
        motion_values = [motion for motion, _ in motion_results]

    # Calculate threshold statistically
    motion_mean = np.mean(motion_values)
    motion_std = np.std(motion_values)
    threshold = motion_mean + sigma_multiplier * motion_std

    # Extract key frames based on motion threshold
    key_frames = []
    for i, (motion, frame) in enumerate(zip(motion_values, frames[1:])):
        if motion > threshold and len(key_frames) < max_frames:
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            key_frames.append(img)

    return key_frames


def extract_audio_from_video(video_path):
    """
    Extract audio from video using pydub and save as a temporary audio file.
    """
    print("Audio extraction started...")
    audio = AudioSegment.from_file(video_path)
    print("Audio extraction completed.")
    audio_path = "/tmp/temp_audio.wav"
    audio.export(audio_path, format="wav")
    print(f"Audio extracted and saved to: {audio_path}")
    return audio_path
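The helpers above are chained by endpoints/video.py; a standalone sketch of that chain follows, where the sample path is an assumption and not part of the commit:

# Standalone sketch mirroring endpoints/video.py; /tmp/sample.mp4 is a placeholder path.
from utils.video_processing import (
    split_video_into_segments,
    extract_motion_key_frames,
    extract_audio_from_video,
)
from utils.audio_transcription import transcribe_audio

for segment in split_video_into_segments("/tmp/sample.mp4", segment_duration=30):
    frames = extract_motion_key_frames(segment, max_frames=50, sigma_multiplier=2)
    transcript = transcribe_audio(extract_audio_from_video(segment))
    print(segment, len(frames), transcript[:80])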