updated ui

parent 0eb6bdc9c4
commit 31a7abea14

.gitignore (vendored) | 2
@@ -11,3 +11,5 @@
 *.app
 .snapshots/*

+__pycache__/*
+endpoints/__pycache__/*

Dockerfile | 31 (new file)
@@ -0,0 +1,31 @@
FROM python:3.12-slim

WORKDIR /home/ooin/st/app

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    libssl-dev \
    libffi-dev \
    libxml2-dev \
    libxslt1-dev \
    zlib1g-dev \
    libjpeg-dev \
    libopenblas-dev \
    libopenmpi-dev \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .

RUN pip install --upgrade pip

RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 flash-attn==2.7.2.post1 -f https://download.pytorch.org/whl/torch_stable.html

RUN pip install --no-cache-dir -r requirements.txt \
    --index-url https://pypi.tuna.tsinghua.edu.cn/simple/ --timeout 100 --retries 5

COPY . .

EXPOSE 80

CMD ["uvicorn", "pipeline_UI3+audio:app", "--host", "0.0.0.0", "--port", "80"]

__pycache__/main.cpython-312.pyc | BIN (new file, binary, not shown)
__pycache__/pipeline_setup.cpython-312.pyc | BIN (new file, binary, not shown)

endpoints/image.py | 53 (new file)
@@ -0,0 +1,53 @@
from fastapi import UploadFile, Form
from fastapi.responses import JSONResponse
import asyncio
import io
from PIL import Image
from pipeline_setup import pipe, IMAGE_TOKEN
from utils.image_processing import encode_image_base64

async def image_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process an image with the user's query.
    """
    try:
        if file.content_type not in ["image/jpeg", "image/png"]:
            return JSONResponse({"query": question, "error": "Unsupported file type."})

        image_data = await file.read()
        image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
        encoded_image_base64 = encode_image_base64(image)

        question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
        response = await asyncio.to_thread(pipe, (question, image))
        return JSONResponse({"query": question, "response": response.text})
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})


# import mimetypes
# async def image_query(file: UploadFile, question: str = Form(...)):
#     """
#     API endpoint to process an image with the user's query.
#     """
#     try:
#         # Get the file path from the UploadFile object
#         file_path = file.filename
#
#         # Determine the file type using the file extension
#         file_type, _ = mimetypes.guess_type(file_path)
#         if file_type not in ["image/jpeg", "image/png"]:
#             return {"query": question, "error": "Unsupported file type."}
#
#         # Read the image file
#         image_data = await file.read()
#         image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512))
#         encoded_image_base64 = encode_image_base64(image)
#
#         # Prepare the query with the image token
#         question_with_image_token = f"{question}\n{IMAGE_TOKEN}"
#
#         # Query the model
#         response = await asyncio.to_thread(pipe, (question, image))
#         return {"query": question, "response": response.text}
#     except Exception as e:
#         return {"query": question, "error": str(e)}
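A minimal client sketch (not part of this commit) for exercising this endpoint once the app is running; the host, port, and file name are assumptions, with the /api/image route and port 8080 taken from main.py below.

# sketch: call the image endpoint with an uploaded JPEG and a question
import httpx

with open("example.jpg", "rb") as f:  # hypothetical local file
    resp = httpx.post(
        "http://localhost:8080/api/image",
        files={"file": ("example.jpg", f, "image/jpeg")},
        data={"question": "What is shown in this image?"},
        timeout=120.0,
    )
print(resp.json())  # {"query": ..., "response": ...} or {"query": ..., "error": ...}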

endpoints/text.py | 24 (new file)
@@ -0,0 +1,24 @@
from fastapi import Form
from fastapi.responses import JSONResponse
from asyncio import to_thread
from pipeline_setup import pipe

async def text_query(question: str = Form(...)):
    """
    API endpoint to process text input with the user's query.
    """
    try:
        response = await to_thread(pipe, question)
        return JSONResponse({"query": question, "response": response.text})
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})


# async def text_query(question: str = Form(...)):
#     """
#     API endpoint to process text input with the user's query.
#     """
#     try:
#         response = await to_thread(pipe, question)
#         return {"query": question, "response": response.text}
#     except Exception as e:
#         return {"query": question, "error": str(e)}
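A matching sketch for the text endpoint (not part of this commit); only a form field named question is required, and the same localhost:8080 assumption applies.

# sketch: post a plain-text question as form data
import httpx

resp = httpx.post(
    "http://localhost:8080/api/text",
    data={"question": "Summarize what this service does."},
    timeout=120.0,
)
print(resp.json())  # {"query": ..., "response": ...} on success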

endpoints/video.py | 229 (new file)
@@ -0,0 +1,229 @@
from fastapi import UploadFile, Form
from fastapi.responses import JSONResponse
from pipeline_setup import pipe
from utils.image_processing import encode_image_base64
from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video
from utils.audio_transcription import transcribe_audio
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

async def video_query(file: UploadFile, question: str = Form(...)):
    """
    API endpoint to process a video file with the user's query.
    """
    try:
        print("Processing video...")

        # Validate file type
        if file.content_type not in ["video/mp4", "video/avi", "video/mkv"]:
            return JSONResponse({"query": question, "error": "Unsupported video file type."})

        # Start overall timer
        overall_start_time = time.time()

        # Save the uploaded video to a temporary file
        print("Reading video...")
        video_data = await file.read()
        temp_video_path = "/tmp/temp_video.mp4"
        with open(temp_video_path, "wb") as temp_video_file:
            temp_video_file.write(video_data)
        print(f"Temp video saved to: {temp_video_path}")

        # Record the time after reading the video
        video_reading_time = time.time()

        # Split the video into segments
        print("Splitting video...")
        segments = split_video_into_segments(temp_video_path, segment_duration=30)
        print(f"Video split into {len(segments)} segments.")

        aggregated_responses = []
        segment_timings = []

        for i, segment_path in enumerate(segments):
            print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")

            # Start timing for the segment
            segment_start_time = time.time()

            # Extract key frames
            frame_start_time = time.time()
            imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
            frame_time = time.time()

            # Extract audio and transcribe
            audio_start_time = time.time()
            audio_path = extract_audio_from_video(segment_path)
            transcribed_text = transcribe_audio(audio_path)
            audio_time = time.time()

            # Combine transcribed text with the query
            combined_query = f"Audio Transcript: {transcribed_text}\n{question}"

            # Prepare content for the pipeline
            question_with_frames = ""
            for j, img in enumerate(imgs):
                question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
            question_with_frames += combined_query

            content = [{"type": "text", "text": question_with_frames}]
            for img in imgs:
                content.append({
                    "type": "image_url",
                    "image_url": {
                        "max_dynamic_patch": 1,
                        "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
                    }
                })

            # Query the model
            inference_start_time = time.time()
            messages = [dict(role="user", content=content)]
            response = await asyncio.to_thread(pipe, messages)
            inference_time = time.time()

            # Aggregate response
            aggregated_responses.append(response.text)

            # Calculate timing for the segment
            segment_timings.append({
                "segment_index": i + 1,
                "segment_processing_time": inference_time - segment_start_time,
                "frame_extraction_time": frame_time - frame_start_time,
                "audio_extraction_time": audio_time - audio_start_time,
                "model_inference_time": inference_time - inference_start_time
            })

            print(f"transcription: {transcribed_text}")
            # print(f"content: {content}")

        overall_end_time = time.time()

        # Aggregate total timings
        total_timings = {
            "video_reading_time": video_reading_time - overall_start_time,
            "total_segments": len(segments),
            "total_processing_time": overall_end_time - overall_start_time,
            "segment_details": segment_timings
        }

        return JSONResponse({
            "question": question,
            "responses": aggregated_responses,
            "timings": total_timings,
        })
    except Exception as e:
        return JSONResponse({"query": question, "error": str(e)})


# async def video_query(file: UploadFile, question: str = Form(...)):
#     """
#     API endpoint to process a video file with the user's query.
#     """
#     try:
#         print("Processing video...")
#
#         # Get the file path from the UploadFile object
#         file_path = file.filename
#
#         # Determine the file type using the file extension
#         file_type, _ = mimetypes.guess_type(file_path)
#         if file_type is None or not file_type.startswith("video/"):
#             return {"query": question, "error": "Unsupported video file type."}
#
#         # Start overall timer
#         overall_start_time = time.time()
#
#         # Save the uploaded video to a temporary file
#         print("Reading video...")
#         video_data = await file.read()
#         temp_video_path = "/tmp/temp_video.mp4"
#         with open(temp_video_path, "wb") as temp_video_file:
#             temp_video_file.write(video_data)
#         print(f"Temp video saved to: {temp_video_path}")
#
#         # Record the time after reading the video
#         video_reading_time = time.time()
#
#         # Split the video into segments
#         print("Splitting video...")
#         segments = split_video_into_segments(temp_video_path, segment_duration=30)
#         print(f"Video split into {len(segments)} segments.")
#
#         aggregated_responses = []
#         segment_timings = []
#
#         for i, segment_path in enumerate(segments):
#             print(f"Processing segment {i+1}/{len(segments)}: {segment_path}")
#
#             # Start timing for the segment
#             segment_start_time = time.time()
#
#             # Extract key frames
#             frame_start_time = time.time()
#             imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2)
#             frame_time = time.time()
#
#             # Extract audio and transcribe
#             audio_start_time = time.time()
#             audio_path = extract_audio_from_video(segment_path)
#             transcribed_text = transcribe_audio(audio_path)
#             audio_time = time.time()
#
#             # Combine transcribed text with the query
#             combined_query = f"Audio Transcript: {transcribed_text}\n{question}"
#
#             # Prepare content for the pipeline
#             question_with_frames = ""
#             for j, img in enumerate(imgs):
#                 question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n"
#             question_with_frames += combined_query
#
#             content = [{"type": "text", "text": question_with_frames}]
#             for img in imgs:
#                 content.append({
#                     "type": "image_url",
#                     "image_url": {
#                         "max_dynamic_patch": 1,
#                         "url": f"data:image/jpeg;base64,{encode_image_base64(img)}"
#                     }
#                 })
#
#             # Query the model
#             inference_start_time = time.time()
#             messages = [dict(role="user", content=content)]
#             response = await asyncio.to_thread(pipe, messages)
#             inference_time = time.time()
#
#             # Aggregate response
#             aggregated_responses.append(response.text)
#
#             # Calculate timing for the segment
#             segment_timings.append({
#                 "segment_index": i + 1,
#                 "segment_processing_time": inference_time - segment_start_time,
#                 "frame_extraction_time": frame_time - frame_start_time,
#                 "audio_extraction_time": audio_time - audio_start_time,
#                 "model_inference_time": inference_time - inference_start_time
#             })
#
#             print(f"transcription: {transcribed_text}")
#
#         overall_end_time = time.time()
#
#         # Aggregate total timings
#         total_timings = {
#             "video_reading_time": video_reading_time - overall_start_time,
#             "total_segments": len(segments),
#             "total_processing_time": overall_end_time - overall_start_time,
#             "segment_details": segment_timings
#         }
#
#         return {
#             "question": question,
#             "responses": aggregated_responses,
#             "timings": total_timings,
#         }
#     except Exception as e:
#         return {"query": question, "error": str(e)}
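A client sketch for the video endpoint (not part of this commit), again assuming a local deployment on port 8080; video processing is slow, so the timeout is generous, and the per-segment timing breakdown comes back under "timings".

# sketch: upload a clip and read back per-segment answers and timings
import httpx

with open("example.mp4", "rb") as f:  # hypothetical local clip
    resp = httpx.post(
        "http://localhost:8080/api/video",
        files={"file": ("example.mp4", f, "video/mp4")},
        data={"question": "Describe what happens in this clip."},
        timeout=600.0,
    )
payload = resp.json()
print(payload.get("responses"))                            # one answer per 30-second segment
print(payload.get("timings", {}).get("segment_details"))   # timing breakdown per segment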

main.py | 18 (new file)
@@ -0,0 +1,18 @@
from fastapi import FastAPI
from endpoints.text import text_query
from endpoints.image import image_query
from endpoints.video import video_query

app = FastAPI()

# Register routes
app.post("/api/text")(text_query)
app.post("/api/image")(image_query)
app.post("/api/video")(video_query)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8080, reload=True)

# python main.py
# uvicorn main:app --reload

pipeline_setup.py | 19 (new file)
@@ -0,0 +1,19 @@
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig

# Constants
IMAGE_TOKEN = "[IMAGE_TOKEN]"

# Model initialization
model = "OpenGVLab/InternVL2-26B-AWQ"
pipe = pipeline(
    model,
    backend_config=TurbomindEngineConfig(
        model_format="awq",
        tp=4,
        session_len=12864,
        max_batch_size=1,
        cache_max_entry_count=0.05,
        cache_block_seq_len=32768,
        quant_policy=4
    )
)
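For orientation, a sketch (not part of this commit) of how the endpoints consume this pipe object, following lmdeploy's pipeline interface; the prompt and image file are placeholders.

# sketch: direct pipeline calls, mirroring endpoints/text.py and endpoints/image.py
from pipeline_setup import pipe
from PIL import Image

# text-only prompt: the pipeline returns an object exposing .text
print(pipe("Give a one-sentence status check.").text)

# vision-language prompt: a (text, PIL.Image) tuple, as in endpoints/image.py
img = Image.open("example.jpg").convert("RGB")  # hypothetical local file
print(pipe(("Describe this image.", img)).text)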

requirements.txt | 117 (new file)
@@ -0,0 +1,117 @@
accelerate==1.2.1
addict==2.4.0
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.7.0
attrs==24.3.0
bitsandbytes==0.45.0
certifi==2024.12.14
charset-normalizer==3.4.0
click==8.1.8
cloudpickle==3.1.0
datasets==3.2.0
decord==0.6.0
dill==0.3.8
diskcache==5.6.3
distro==1.9.0
einops==0.8.0
fastapi==0.115.6
filelock==3.16.1
fire==0.7.0
# flash-attn==2.7.2.post1
frozenlist==1.5.0
fsspec==2024.9.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
huggingface-hub==0.27.0
idna==3.10
interegular==0.3.3
Jinja2==3.1.5
jiter==0.8.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
lark==1.2.2
llvmlite==0.43.0
lmdeploy==0.6.4
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mdurl==0.1.2
mmengine-lite==0.10.5
modelscope==1.21.0
mpmath==1.3.0
multidict==6.1.0
multiprocess==0.70.16
nest-asyncio==1.6.0
networkx==3.4.2
ninja==1.11.1.3
numba==0.60.0
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-ml-py==12.560.30
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.6.85
nvidia-nvtx-cu12==12.1.105
openai==1.58.1
opencv-python==4.10.0.84
outlines==0.0.46
packaging==24.2
pandas==2.2.3
peft==0.11.1
pillow==11.0.0
platformdirs==4.3.6
propcache==0.2.1
protobuf==5.29.2
psutil==6.1.1
pyairports==2.1.1
pyarrow==18.1.0
pycountry==24.6.1
pydantic==2.10.4
pydantic_core==2.27.2
Pygments==2.18.0
pynvml==12.0.0
python-dateutil==2.9.0.post0
python-multipart==0.0.20
pytz==2024.2
PyYAML==6.0.2
referencing==0.35.1
regex==2024.11.6
requests==2.32.3
rich==13.9.4
rpds-py==0.22.3
safetensors==0.4.5
sentencepiece==0.2.0
setuptools==75.6.0
shortuuid==1.0.13
six==1.17.0
sniffio==1.3.1
starlette==0.41.3
sympy==1.13.3
termcolor==2.5.0
tiktoken==0.8.0
timm==1.0.12
tokenizers==0.21.0
# torch==2.4.0
# torchaudio==2.4.0
# torchvision==0.19.0
tqdm==4.67.1
transformers==4.47.1
triton==3.0.0
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.3.0
uvicorn==0.34.0
wheel==0.45.1
xxhash==3.5.0
yapf==0.43.0
yarl==1.18.3

ui.py | 63 (new file)
@@ -0,0 +1,63 @@
import gradio as gr
import asyncio
from endpoints.text import text_query
from endpoints.image import image_query
from endpoints.video import video_query

def setup_ui():
    with gr.Blocks() as ui:
        gr.Markdown(
            """
            # Multimodal Query Interface
            Submit text, image, or video queries and get insights powered by APIs.
            """
        )

        # Tabbed layout
        with gr.Tabs():
            # Text Query Tab
            with gr.Tab("Text Query"):
                gr.Markdown("### Submit a Text Query")
                with gr.Row():
                    text_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    text_button = gr.Button("Submit")
                text_output = gr.Textbox(label="Response", interactive=False)
                text_button.click(
                    fn=lambda q: asyncio.run(text_query(q)),
                    inputs=[text_input],
                    outputs=[text_output]
                )

            # Image Query Tab
            with gr.Tab("Image Query"):
                gr.Markdown("### Submit an Image Query")
                with gr.Row():
                    image_input = gr.File(label="Upload Image")
                    image_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    image_button = gr.Button("Submit")
                image_output = gr.Textbox(label="Response", interactive=False)
                image_button.click(
                    fn=lambda img, q: asyncio.run(image_query(img, q)),
                    inputs=[image_input, image_question_input],
                    outputs=[image_output]
                )

            # Video Query Tab
            with gr.Tab("Video Query"):
                gr.Markdown("### Submit a Video Query")
                with gr.Row():
                    video_input = gr.File(label="Upload Video")
                    video_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...")
                    video_button = gr.Button("Submit")
                video_output = gr.Textbox(label="Response", interactive=False)
                video_button.click(
                    fn=lambda vid, q: asyncio.run(video_query(vid, q)),
                    inputs=[video_input, video_question_input],
                    outputs=[video_output]
                )

    return ui

if __name__ == "__main__":
    ui = setup_ui()
    ui.launch(server_name="0.0.0.0", server_port=7860)

utils/__pycache__/audio_transcription.cpython-312.pyc | BIN (new file, binary, not shown)
utils/__pycache__/image_processing.cpython-312.pyc | BIN (new file, binary, not shown)
utils/__pycache__/video_processing.cpython-312.pyc | BIN (new file, binary, not shown)

utils/audio_transcription.py | 13 (new file)
@@ -0,0 +1,13 @@
from pydub import AudioSegment
from whisper import load_model

def extract_audio_from_video(video_path: str) -> str:
    audio = AudioSegment.from_file(video_path)
    audio_path = "/tmp/temp_audio.wav"
    audio.export(audio_path, format="wav")
    return audio_path

def transcribe_audio(audio_path: str) -> str:
    model = load_model("base")
    result = model.transcribe(audio_path)
    return result["text"]
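A small usage sketch (not part of this commit), assuming a clip at a hypothetical local path; extraction writes a temporary wav under /tmp that Whisper then transcribes.

# sketch: extract the audio track and transcribe it
from utils.audio_transcription import extract_audio_from_video, transcribe_audio

wav_path = extract_audio_from_video("example.mp4")  # hypothetical clip
print(transcribe_audio(wav_path))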

utils/image_processing.py | 12 (new file)
@@ -0,0 +1,12 @@
import io
import base64
from PIL import Image

def encode_image_base64(image: Image.Image) -> str:
    """
    Encode a PIL Image to a Base64 string.
    """
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()
utils/video_processing.py
Normal file
93
utils/video_processing.py
Normal file
@ -0,0 +1,93 @@
|
||||
import cv2
|
||||
import os
|
||||
import subprocess
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from pydub import AudioSegment
|
||||
from decord import VideoReader, cpu
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
def split_video_into_segments(video_path, segment_duration=30):
|
||||
"""
|
||||
Splits a video into segments of a specified duration using FFmpeg.
|
||||
"""
|
||||
output_dir = "/tmp/video_segments"
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# Calculate total duration of the video
|
||||
cap = cv2.VideoCapture(video_path)
|
||||
fps = int(cap.get(cv2.CAP_PROP_FPS))
|
||||
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
total_duration = total_frames / fps
|
||||
cap.release()
|
||||
|
||||
segments = []
|
||||
for start_time in range(0, int(total_duration), segment_duration):
|
||||
segment_file = os.path.join(output_dir, f"segment_{start_time}.mp4")
|
||||
command = [
|
||||
"ffmpeg", "-i", video_path,
|
||||
"-ss", str(start_time),
|
||||
"-t", str(segment_duration),
|
||||
"-c", "copy", segment_file
|
||||
]
|
||||
subprocess.run(command, check=True)
|
||||
segments.append(segment_file)
|
||||
|
||||
return segments
|
||||
|
||||
def extract_motion_key_frames(video_path, max_frames=20, sigma_multiplier=2, frame_interval=1):
|
||||
"""
|
||||
Extracts key frames from a video based on motion intensity.
|
||||
"""
|
||||
def calculate_motion(frame_pair):
|
||||
"""
|
||||
Calculates motion between two consecutive frames using optical flow.
|
||||
"""
|
||||
prev_gray, current_frame = frame_pair
|
||||
current_gray = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
|
||||
flow = cv2.calcOpticalFlowFarneback(prev_gray, current_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
|
||||
motion = np.sum(flow ** 2)
|
||||
return motion, current_gray
|
||||
|
||||
# Load video frames using Decord
|
||||
video = VideoReader(video_path, ctx=cpu(0))
|
||||
frames_batch = video.get_batch(range(0, len(video), frame_interval)).asnumpy()
|
||||
|
||||
# Resize frames for faster processing
|
||||
frames = [cv2.resize(frame, (frame.shape[1] // 2, frame.shape[0] // 2)) for frame in frames_batch]
|
||||
|
||||
# Initialize the first frame
|
||||
prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)
|
||||
frame_pairs = [(prev_gray, frames[i]) for i in range(1, len(frames))]
|
||||
|
||||
# Calculate motion statistics
|
||||
motion_values = []
|
||||
with ThreadPoolExecutor() as executor:
|
||||
motion_results = list(executor.map(calculate_motion, frame_pairs))
|
||||
motion_values = [motion for motion, _ in motion_results]
|
||||
|
||||
# Calculate threshold statistically
|
||||
motion_mean = np.mean(motion_values)
|
||||
motion_std = np.std(motion_values)
|
||||
threshold = motion_mean + sigma_multiplier * motion_std
|
||||
|
||||
# Extract key frames based on motion threshold
|
||||
key_frames = []
|
||||
for i, (motion, frame) in enumerate(zip(motion_values, frames[1:])):
|
||||
if motion > threshold and len(key_frames) < max_frames:
|
||||
img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
|
||||
key_frames.append(img)
|
||||
|
||||
return key_frames
|
||||
|
||||
def extract_audio_from_video(video_path):
|
||||
"""
|
||||
Extract audio from video using pydub and save as a temporary audio file.
|
||||
"""
|
||||
print("Audio extraction started...")
|
||||
audio = AudioSegment.from_file(video_path)
|
||||
print("Audio extraction completed.")
|
||||
audio_path = "/tmp/temp_audio.wav"
|
||||
audio.export(audio_path, format="wav")
|
||||
print(f"Audio extracted and saved to: {audio_path}")
|
||||
return audio_path
|
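A sketch (not part of this commit) of how these helpers chain together outside the API, with a hypothetical local clip; this is essentially the per-segment loop in endpoints/video.py.

# sketch: split a clip, then pull key frames and the audio track per segment
from utils.video_processing import (
    split_video_into_segments,
    extract_motion_key_frames,
    extract_audio_from_video,
)

segments = split_video_into_segments("example.mp4", segment_duration=30)  # hypothetical clip
for seg in segments:
    frames = extract_motion_key_frames(seg, max_frames=20, sigma_multiplier=2)
    wav_path = extract_audio_from_video(seg)
    print(seg, f"{len(frames)} key frames, audio at {wav_path}")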