From 31a7abea14379c343db41f525bf98b0e4d9ea51e Mon Sep 17 00:00:00 2001 From: Zixiao Wang Date: Thu, 23 Jan 2025 21:50:55 +0800 Subject: [PATCH] updated ui --- .gitignore | 2 + Dockerfile | 31 +++ README.md | 2 +- __pycache__/main.cpython-312.pyc | Bin 0 -> 769 bytes __pycache__/pipeline_setup.cpython-312.pyc | Bin 0 -> 583 bytes endpoints/image.py | 53 ++++ endpoints/text.py | 24 ++ endpoints/video.py | 229 ++++++++++++++++++ main.py | 18 ++ pipeline_setup.py | 19 ++ requirements.txt | 117 +++++++++ ui.py | 63 +++++ .../audio_transcription.cpython-312.pyc | Bin 0 -> 895 bytes .../image_processing.cpython-312.pyc | Bin 0 -> 763 bytes .../video_processing.cpython-312.pyc | Bin 0 -> 5203 bytes utils/audio_transcription.py | 13 + utils/image_processing.py | 12 + utils/video_processing.py | 93 +++++++ 18 files changed, 675 insertions(+), 1 deletion(-) create mode 100644 Dockerfile create mode 100644 __pycache__/main.cpython-312.pyc create mode 100644 __pycache__/pipeline_setup.cpython-312.pyc create mode 100644 endpoints/image.py create mode 100644 endpoints/text.py create mode 100644 endpoints/video.py create mode 100644 main.py create mode 100644 pipeline_setup.py create mode 100644 requirements.txt create mode 100644 ui.py create mode 100644 utils/__pycache__/audio_transcription.cpython-312.pyc create mode 100644 utils/__pycache__/image_processing.cpython-312.pyc create mode 100644 utils/__pycache__/video_processing.cpython-312.pyc create mode 100644 utils/audio_transcription.py create mode 100644 utils/image_processing.py create mode 100644 utils/video_processing.py diff --git a/.gitignore b/.gitignore index 60d7971..f479596 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ *.app .snapshots/* + __pycache__/* +endpoints/__pycache__/* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9a038e2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,31 @@ +FROM python:3.12-slim + +WORKDIR /home/ooin/st/app + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + libssl-dev \ + libffi-dev \ + libxml2-dev \ + libxslt1-dev \ + zlib1g-dev \ + libjpeg-dev \ + libopenblas-dev \ + libopenmpi-dev \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . + +RUN pip install --upgrade pip + +RUN pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 flash-attn==2.7.2.post1 -f https://download.pytorch.org/whl/torch_stable.html + +RUN pip install --no-cache-dir -r requirements.txt \ + --index-url https://pypi.tuna.tsinghua.edu.cn/simple/ --timeout 100 --retries 5 + +COPY . . 
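+
+# Note: utils/video_processing.py shells out to the ffmpeg CLI and pydub decodes mp4
+# audio with it, while opencv-python expects libGL at import time; python:3.12-slim
+# provides none of these. If the deployment image does not already include them, an
+# extra layer along these lines is needed:
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends ffmpeg libgl1 libglib2.0-0 && \
+    rm -rf /var/lib/apt/lists/*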
+ +EXPOSE 80 + +CMD ["uvicorn", "pipeline_UI3+audio:app", "--host", "0.0.0.0", "--port", "80"] \ No newline at end of file diff --git a/README.md b/README.md index cc8efb3..a1df0ab 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ -# tiktok_AI +# api diff --git a/__pycache__/main.cpython-312.pyc b/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c2dbc978a7f30dd86fa455f7b14b6391ecb31217 GIT binary patch literal 769 zcmaJLU6{r?B>r&_H<*jhs9mF zR<^dX7M7NRrPz3f93u;gmEFR1R?cKMyCR2^FnRCGoA14k&26( zBhg(hiYru}JzcF;8ahAsAGJPL&k+J@5~F(^fHlPw_0V!HkF-emk&a<|3WO5MRSQJZdrv6hcQ!x1ha4f8)hj0?9Vn3pu$MxQB@F-z1 z$E+8>ffFHw-|Nm!5>Y1+ZcdgvdC<+1uD7x2$O+nMpBuNLg!Dpso1GVW`Ps9lVw+%u z&={IyxH^Vw;{Ku8FMfgc5M6&IPVF>EZA$r2B@n8}p;*d+Q&k968by>;NL7)HkY#(f$tw1)vtA3F zQ}_v-5d4Fx{RjL6PTbtRaDYp0mD&?0)(w@I!|a>4dNXh4d%eB{WIX!)Y4}Y6;8$7P zg#8a(4kY*tAb?5)V#QTr)m3B7)f50#SBIKwr~oOb%qrBO0V}WyO<04*+0tos(dEoK zRTs#uBY%mgPW!UT!tVa5U)V*LG%d&oLznW?FySf7R)A zy~Bf--NC!*y@Ldi-qF4vIDNtqBS-t~t@hIwTb(zrh33DX%yl^mKW_stmn(wd*Dqz9 zJi}lnEmVaaQ-~t(n6lXCLgk6DQk150pcf$`mSg{e7x+9Jc`2SspP?U)kXN#ZaF%-^ z%?KAO3p$8sIQCLBxz^f~%qQGSXoSPOrHJLg566f=sVSEm$mwPjLzF}`XPPv}O0Ue2 zNm&1FDU556LNBj%0xso0_|tMmG)4}km^dkSI39C4_Bws%=A_aF&ypueF3fGI&my$L z)}^w%Lb@x{g{COVPq6m`>|InqV`FB?r?xpW&Z~_=ubr%Pv;$6t~xljfchk W2Ss!3s$s5Ht_;gmKk95v_W273Nw59@ literal 0 HcmV?d00001 diff --git a/endpoints/image.py b/endpoints/image.py new file mode 100644 index 0000000..e46c558 --- /dev/null +++ b/endpoints/image.py @@ -0,0 +1,53 @@ +from fastapi import UploadFile, Form +from fastapi.responses import JSONResponse +import io +from PIL import Image +from pipeline_setup import pipe, IMAGE_TOKEN +from utils.image_processing import encode_image_base64 + +async def image_query(file: UploadFile, question: str = Form(...)): + """ + API endpoint to process an image with the user's query. + """ + try: + if file.content_type not in ["image/jpeg", "image/png"]: + return JSONResponse({"query": question, "error": "Unsupported file type."}) + + image_data = await file.read() + image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512)) + encoded_image_base64 = encode_image_base64(image) + + question_with_image_token = f"{question}\n{IMAGE_TOKEN}" + response = await asyncio.to_thread(pipe, (question, image)) + return JSONResponse({"query": question, "response": response.text}) + except Exception as e: + return JSONResponse({"query": question, "error": str(e)}) + + +# import mimetypes +# async def image_query(file: UploadFile, question: str = Form(...)): +# """ +# API endpoint to process an image with the user's query. 
+# """ +# try: +# # Get the file path from the UploadFile object +# file_path = file.filename + +# # Determine the file type using the file extension +# file_type, _ = mimetypes.guess_type(file_path) +# if file_type not in ["image/jpeg", "image/png"]: +# return {"query": question, "error": "Unsupported file type."} + +# # Read the image file +# image_data = await file.read() +# image = Image.open(io.BytesIO(image_data)).convert("RGB").resize((512, 512)) +# encoded_image_base64 = encode_image_base64(image) + +# # Prepare the query with the image token +# question_with_image_token = f"{question}\n{IMAGE_TOKEN}" + +# # Query the model +# response = await asyncio.to_thread(pipe, (question, image)) +# return {"query": question, "response": response.text} +# except Exception as e: +# return {"query": question, "error": str(e)} diff --git a/endpoints/text.py b/endpoints/text.py new file mode 100644 index 0000000..f1afc03 --- /dev/null +++ b/endpoints/text.py @@ -0,0 +1,24 @@ +from fastapi import Form +from fastapi.responses import JSONResponse +from asyncio import to_thread +from pipeline_setup import pipe + +async def text_query(question: str = Form(...)): + """ + API endpoint to process text input with the user's query. + """ + try: + response = await to_thread(pipe, question) + return JSONResponse({"query": question, "response": response.text}) + except Exception as e: + return JSONResponse({"query": question, "error": str(e)}) + +# async def text_query(question: str = Form(...)): +# """ +# API endpoint to process text input with the user's query. +# """ +# try: +# response = await to_thread(pipe, question) +# return {"query": question, "response": response.text} +# except Exception as e: +# return {"query": question, "error": str(e)} diff --git a/endpoints/video.py b/endpoints/video.py new file mode 100644 index 0000000..62fbc4a --- /dev/null +++ b/endpoints/video.py @@ -0,0 +1,229 @@ +from fastapi import UploadFile, Form +from fastapi.responses import JSONResponse +from pipeline_setup import pipe +from utils.image_processing import encode_image_base64 +from utils.video_processing import split_video_into_segments, extract_motion_key_frames, extract_audio_from_video +from utils.audio_transcription import transcribe_audio +import asyncio +import time +from concurrent.futures import ThreadPoolExecutor + +async def video_query(file: UploadFile, question: str = Form(...)): + """ + API endpoint to process a video file with the user's query. 
+ """ + try: + print("Processing video...") + + # Validate file type + if file.content_type not in ["video/mp4", "video/avi", "video/mkv"]: + return JSONResponse({"query": question, "error": "Unsupported video file type."}) + + # Start overall timer + overall_start_time = time.time() + + # Save the uploaded video to a temporary file + print("Reading video...") + video_data = await file.read() + temp_video_path = "/tmp/temp_video.mp4" + with open(temp_video_path, "wb") as temp_video_file: + temp_video_file.write(video_data) + print(f"Temp video saved to: {temp_video_path}") + + # Record the time after reading the video + video_reading_time = time.time() + + # Split the video into segments + print("Splitting video...") + segments = split_video_into_segments(temp_video_path, segment_duration=30) + print(f"Video split into {len(segments)} segments.") + + aggregated_responses = [] + segment_timings = [] + + for i, segment_path in enumerate(segments): + print(f"Processing segment {i+1}/{len(segments)}: {segment_path}") + + # Start timing for the segment + segment_start_time = time.time() + + # Extract key frames + frame_start_time = time.time() + imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2) + frame_time = time.time() + + # Extract audio and transcribe + audio_start_time = time.time() + audio_path = extract_audio_from_video(segment_path) + transcribed_text = transcribe_audio(audio_path) + audio_time = time.time() + + # Combine transcribed text with the query + combined_query = f"Audio Transcript: {transcribed_text}\n{question}" + + # Prepare content for the pipeline + question_with_frames = "" + for j, img in enumerate(imgs): + question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n" + question_with_frames += combined_query + + content = [{"type": "text", "text": question_with_frames}] + for img in imgs: + content.append({ + "type": "image_url", + "image_url": { + "max_dynamic_patch": 1, + "url": f"data:image/jpeg;base64,{encode_image_base64(img)}" + } + }) + + # Query the model + inference_start_time = time.time() + messages = [dict(role="user", content=content)] + response = await asyncio.to_thread(pipe, messages) + inference_time = time.time() + + # Aggregate response + aggregated_responses.append(response.text) + + # Calculate timing for the segment + segment_timings.append({ + "segment_index": i + 1, + "segment_processing_time": inference_time - segment_start_time, + "frame_extraction_time": frame_time - frame_start_time, + "audio_extraction_time": audio_time - audio_start_time, + "model_inference_time": inference_time - inference_start_time + }) + + print(f"transcription: {transcribed_text}") + # print(f"content: {content}") + + overall_end_time = time.time() + + # Aggregate total timings + total_timings = { + "video_reading_time": video_reading_time - overall_start_time, + "total_segments": len(segments), + "total_processing_time": overall_end_time - overall_start_time, + "segment_details": segment_timings + } + + return JSONResponse({ + "question": question, + "responses": aggregated_responses, + "timings": total_timings, + }) + except Exception as e: + return JSONResponse({"query": question, "error": str(e)}) + + +# async def video_query(file: UploadFile, question: str = Form(...)): +# """ +# API endpoint to process a video file with the user's query. 
+# """ +# try: +# print("Processing video...") + +# # Get the file path from the UploadFile object +# file_path = file.filename + +# # Determine the file type using the file extension +# file_type, _ = mimetypes.guess_type(file_path) +# if file_type is None or not file_type.startswith("video/"): +# return {"query": question, "error": "Unsupported video file type."} + +# # Start overall timer +# overall_start_time = time.time() + +# # Save the uploaded video to a temporary file +# print("Reading video...") +# video_data = await file.read() +# temp_video_path = "/tmp/temp_video.mp4" +# with open(temp_video_path, "wb") as temp_video_file: +# temp_video_file.write(video_data) +# print(f"Temp video saved to: {temp_video_path}") + +# # Record the time after reading the video +# video_reading_time = time.time() + +# # Split the video into segments +# print("Splitting video...") +# segments = split_video_into_segments(temp_video_path, segment_duration=30) +# print(f"Video split into {len(segments)} segments.") + +# aggregated_responses = [] +# segment_timings = [] + +# for i, segment_path in enumerate(segments): +# print(f"Processing segment {i+1}/{len(segments)}: {segment_path}") + +# # Start timing for the segment +# segment_start_time = time.time() + +# # Extract key frames +# frame_start_time = time.time() +# imgs = extract_motion_key_frames(segment_path, max_frames=50, sigma_multiplier=2) +# frame_time = time.time() + +# # Extract audio and transcribe +# audio_start_time = time.time() +# audio_path = extract_audio_from_video(segment_path) +# transcribed_text = transcribe_audio(audio_path) +# audio_time = time.time() + +# # Combine transcribed text with the query +# combined_query = f"Audio Transcript: {transcribed_text}\n{question}" + +# # Prepare content for the pipeline +# question_with_frames = "" +# for j, img in enumerate(imgs): +# question_with_frames += f"Frame{j+1}: {{IMAGE_TOKEN}}\n" +# question_with_frames += combined_query + +# content = [{"type": "text", "text": question_with_frames}] +# for img in imgs: +# content.append({ +# "type": "image_url", +# "image_url": { +# "max_dynamic_patch": 1, +# "url": f"data:image/jpeg;base64,{encode_image_base64(img)}" +# } +# }) + +# # Query the model +# inference_start_time = time.time() +# messages = [dict(role="user", content=content)] +# response = await asyncio.to_thread(pipe, messages) +# inference_time = time.time() + +# # Aggregate response +# aggregated_responses.append(response.text) + +# # Calculate timing for the segment +# segment_timings.append({ +# "segment_index": i + 1, +# "segment_processing_time": inference_time - segment_start_time, +# "frame_extraction_time": frame_time - frame_start_time, +# "audio_extraction_time": audio_time - audio_start_time, +# "model_inference_time": inference_time - inference_start_time +# }) + +# print(f"transcription: {transcribed_text}") + +# overall_end_time = time.time() + +# # Aggregate total timings +# total_timings = { +# "video_reading_time": video_reading_time - overall_start_time, +# "total_segments": len(segments), +# "total_processing_time": overall_end_time - overall_start_time, +# "segment_details": segment_timings +# } + +# return { +# "question": question, +# "responses": aggregated_responses, +# "timings": total_timings, +# } +# except Exception as e: +# return {"query": question, "error": str(e)} \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..97d8b51 --- /dev/null +++ b/main.py @@ -0,0 +1,18 @@ +from fastapi import FastAPI +from 
endpoints.text import text_query +from endpoints.image import image_query +from endpoints.video import video_query + +app = FastAPI() + +# Register routes +app.post("/api/text")(text_query) +app.post("/api/image")(image_query) +app.post("/api/video")(video_query) + +if __name__ == "__main__": + import uvicorn + uvicorn.run("main:app", host="0.0.0.0", port=8080, reload=True) + +# python main.py +# uvicorn main:app --reload \ No newline at end of file diff --git a/pipeline_setup.py b/pipeline_setup.py new file mode 100644 index 0000000..e27eb31 --- /dev/null +++ b/pipeline_setup.py @@ -0,0 +1,19 @@ +from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig + +# Constants +IMAGE_TOKEN = "[IMAGE_TOKEN]" + +# Model initialization +model = "OpenGVLab/InternVL2-26B-AWQ" +pipe = pipeline( + model, + backend_config=TurbomindEngineConfig( + model_format="awq", + tp=4, + session_len=12864, + max_batch_size=1, + cache_max_entry_count=0.05, + cache_block_seq_len=32768, + quant_policy=4 + ) +) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8806a97 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,117 @@ +accelerate==1.2.1 +addict==2.4.0 +aiohappyeyeballs==2.4.4 +aiohttp==3.11.11 +aiosignal==1.3.2 +annotated-types==0.7.0 +anyio==4.7.0 +attrs==24.3.0 +bitsandbytes==0.45.0 +certifi==2024.12.14 +charset-normalizer==3.4.0 +click==8.1.8 +cloudpickle==3.1.0 +datasets==3.2.0 +decord==0.6.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +einops==0.8.0 +fastapi==0.115.6 +filelock==3.16.1 +fire==0.7.0 +# flash-attn==2.7.2.post1 +frozenlist==1.5.0 +fsspec==2024.9.0 +h11==0.14.0 +httpcore==1.0.7 +httpx==0.28.1 +huggingface-hub==0.27.0 +idna==3.10 +interegular==0.3.3 +Jinja2==3.1.5 +jiter==0.8.2 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +lark==1.2.2 +llvmlite==0.43.0 +lmdeploy==0.6.4 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +mdurl==0.1.2 +mmengine-lite==0.10.5 +modelscope==1.21.0 +mpmath==1.3.0 +multidict==6.1.0 +multiprocess==0.70.16 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.3 +numba==0.60.0 +numpy==1.26.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==12.560.30 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.1.105 +openai==1.58.1 +opencv-python==4.10.0.84 +outlines==0.0.46 +packaging==24.2 +pandas==2.2.3 +peft==0.11.1 +pillow==11.0.0 +platformdirs==4.3.6 +propcache==0.2.1 +protobuf==5.29.2 +psutil==6.1.1 +pyairports==2.1.1 +pyarrow==18.1.0 +pycountry==24.6.1 +pydantic==2.10.4 +pydantic_core==2.27.2 +Pygments==2.18.0 +pynvml==12.0.0 +python-dateutil==2.9.0.post0 +python-multipart==0.0.20 +pytz==2024.2 +PyYAML==6.0.2 +referencing==0.35.1 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rpds-py==0.22.3 +safetensors==0.4.5 +sentencepiece==0.2.0 +setuptools==75.6.0 +shortuuid==1.0.13 +six==1.17.0 +sniffio==1.3.1 +starlette==0.41.3 +sympy==1.13.3 +termcolor==2.5.0 +tiktoken==0.8.0 +timm==1.0.12 +tokenizers==0.21.0 +# torch==2.4.0 +# torchaudio==2.4.0 +# torchvision==0.19.0 +tqdm==4.67.1 +transformers==4.47.1 +triton==3.0.0 +typing_extensions==4.12.2 +tzdata==2024.2 +urllib3==2.3.0 +uvicorn==0.34.0 +wheel==0.45.1 +xxhash==3.5.0 +yapf==0.43.0 +yarl==1.18.3 diff --git a/ui.py b/ui.py new file mode 100644 index 0000000..3263e18 --- /dev/null +++ b/ui.py @@ -0,0 
+1,63 @@ +import gradio as gr +import asyncio +from endpoints.text import text_query +from endpoints.image import image_query +from endpoints.video import video_query + +def setup_ui(): + with gr.Blocks() as ui: + gr.Markdown( + """ + # Multimodal Query Interface + Submit text, image, or video queries and get insights powered by APIs. + """ + ) + + # Tabbed layout + with gr.Tabs(): + # Text Query Tab + with gr.Tab("Text Query"): + gr.Markdown("### Submit a Text Query") + with gr.Row(): + text_input = gr.Textbox(label="Your Question", placeholder="Type your question here...") + text_button = gr.Button("Submit") + text_output = gr.Textbox(label="Response", interactive=False) + text_button.click( + fn=lambda q: asyncio.run(text_query(q)), + inputs=[text_input], + outputs=[text_output] + ) + + # Image Query Tab + with gr.Tab("Image Query"): + gr.Markdown("### Submit an Image Query") + with gr.Row(): + image_input = gr.File(label="Upload Image") + image_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...") + image_button = gr.Button("Submit") + image_output = gr.Textbox(label="Response", interactive=False) + image_button.click( + fn=lambda img, q: asyncio.run(image_query(img, q)), + inputs=[image_input, image_question_input], + outputs=[image_output] + ) + + # Video Query Tab + with gr.Tab("Video Query"): + gr.Markdown("### Submit a Video Query") + with gr.Row(): + video_input = gr.File(label="Upload Video") + video_question_input = gr.Textbox(label="Your Question", placeholder="Type your question here...") + video_button = gr.Button("Submit") + video_output = gr.Textbox(label="Response", interactive=False) + video_button.click( + fn=lambda vid, q: asyncio.run(video_query(vid, q)), + inputs=[video_input, video_question_input], + outputs=[video_output] + ) + + return ui + +if __name__ == "__main__": + ui = setup_ui() + ui.launch(server_name="0.0.0.0", server_port=7860) \ No newline at end of file diff --git a/utils/__pycache__/audio_transcription.cpython-312.pyc b/utils/__pycache__/audio_transcription.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0165171116a04c7acb71f3beb6b189631736cc8c GIT binary patch literal 895 zcmZ8f&r2IY6rS0gL=$&gZP2Pz2_A$kG)ti;D}qo9E#k#nE@9(LbmMMznVm#aBBW3# zy-6?jR6JVjq5nrOiUq^ao_gvnke+(#o84r!eeAyXX5Nqe-nZ|oY332g)0OwTA2C8d z)nO!L=nPiDIYAUr>>(eQF;>3jX}(_8!Pgm~gpnGdnKz`Iq1p622Ip*h+{*n@tt>2! 
z^(J+LZMN&PhJbV43mjVU1Ij#^Z@H8OmCzA;GRv80^2QE8N=3oZ;T@Dgogk6M>ROtU zIvsRAj2~f&kF*Z1XVPHmbTq1c*3XCa70E)4xf&1rO3n3{%(BBU;3}d%B{OlU%*Q4kD^eB3V)4%tdoS>r9RzN}j)X1T zeG%+e)?e67;d+ssR0!@gq8;vr!VMawuq|iVA>8eV1XCscaLgIM3w;$m(IZe@^xc@f zHkK}prPGxwqtxB#O)Ym{_DJsS=IhPl`O}T=<~Q=NH!*!drUy`fxC!bXt`*Rj>u2EH z=9=OgUV&4U^Pk*eUQ_(eK^$1swaS!F<*f*~`>Hoyi%O;3y{ z<9AeS@+LhYf!~K~wZYLcC^h-aaqG%hgqh8I*XF{dx$vQWWm?_my@{#g<#%frWR6cl z>XtPoGhv%HtI2O1JJ@rhknseT9tj?k8u$N%;*tcy_$uIQdTfn9fHrE!qQ{{6I>z`1 hT7;)h2%b4B^bwdZg@H1CL&x){a0TY9@LQR<o}AfaHxK50->IEvN+R)VgO6ida zd8+8!4a`$+1RB9CB(0iwL1MDHD%RwyaOZ#}nx`aZ64IZsGuQ=QD%nm*7dr5Z(x2Ug zfTd%}%fxJsUFhlaC}weu{_G`QE>`9kpLqgIwX?op2-|Pf0tXwmaaeq4M87szAi{!8 z@z$n68F5>UJJC^&N{9OoM0d1$Kzy5-X|A{dPZ!z@)8b=}sC|ODR*i1*Y;|iBN6C4n zf!T@O429@mS>tlm!+ISP>`?i2!}zyhJq>(p1%cbLD6^P*#Db&By`mK|w@IyN z9F=1d)G(#;UB_*{JS|e0YSZFmNu-l~rUl`8;3@;IFTL%&?!4*t4t}ES%hL1Gt7jjz zUg-zA{YzgQ>8k^M_2OWZUmN7thIwnKuaETYfxbP|cfO;YiO2#>jbsIoYr=6=w2R33 zb5*n`s`nX?WvMU9-OQdJIAIgtB{xJUM?!B2H&!8pmmqrya=*dK7-?|#BK&ar{&Wlk L{Y0-ML}~s58lR-K literal 0 HcmV?d00001 diff --git a/utils/__pycache__/video_processing.cpython-312.pyc b/utils/__pycache__/video_processing.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99900a488ff0a472612bc04fc29c0d048746348c GIT binary patch literal 5203 zcmb7IU2Gf25#Hk+|Bfe867^%tu}oWzCHhB-t;BIGSCM#G8&}l zLuc=}B;zojH{G4OC*INDC^Yxq(`il#6kG!)J=S@kdC7_;=t9VSyjkm4P~-S z#G1=wnSisl>#p~Z==h*~;#s%o6kR$Pinf&u)Lmp%k*#W_%B>!e)7=p^%WB)moY*sj zs+2klx~-|`R(HmBRC)=ey9r+kT}?7bcb-K-Z)TJYKsuX>M>RFT2d1$t2BJw#4ycxe zssT9+9coGvqT#3{2E??&Yf(8FNUPCgBrrIbNJ)`SlTF6or6p2bCX*1Ee-7-~C0&ZD zT1cK!umu{fQeP;;bS6^I8;)=oGs)NvL^JdORRuzWzhE$eoSMa#N0^j^sbH;PlU2i& z;HM-ps;F>VrVkmMd4Kx(l$KT`g9h0RIwENX*WWi58XG?|78)EoYcy19<9(+FLj7l6 zJAKY@DpFkHRmrd`d@>>#Op4bg4JIZ>lLoD7iovSsiz!7GBo$Uu(n-U?r&3Z<40;UK z+NL?9mTZU|j={=lEtS?nASZ~)rwlrrQVmX%H9j5+D||vy4X;%z&mqU6!m=8#Xdx|{ zkPMFS4oBmX;S}UVf=`MnzKU3NrS$KEU6XP`>XJe9E>-K&qEnhY73v%AN^8-$TGB~r zE1rB}T+%cl6|AH)sHe9H_JvXw>INm<_*2#Hx`P_YG?_&q@ zcvsn$0^73GQ)Kt#`q$Z-IpJF5O5|N(wP{bGX;0DHUSvCysxhAc(JhK z#bWc3qW>qk(Y3~%%l(DM_T1?;#))}fKJ}HaE_Y(RY1>lY`|9$`#T|$8&maG?X<&|B zZ{CVoEb0sT^2qIuJ3V*y-xc%Eoh&ww&T(IH4fEpC$QoDoZuflV*S%|8?YnJxkf6RD z$oV}>`#0QnKRf4q%pte`n)ix#er8#`@7lNKs=emB;#=bGyPn1+(N+~iJsziGR?(C0@ zuvJEcZ3KZ9Y*&vlfSli3x4y!cP`XPr=WAuZ@ic@bT(j<~bE?MfGK!LsuzR+&d(c>x z)!n*FXR)MV8_4DX**sYmyaA_`(VLv!3n*Jtg~c_oGV&6s;Vc;4L)Ic1$$CWx{*iVH z{mKh+_^#Cow5bQs@z3ELv%YMt?gJ^@AO(kO+IGS@)=H>KsdMEO>Z(QtDkBJ0HL_l> z(`|Zv25c<@-B~~EwLxnmBeiDNsEmb=6m8 zrwwF|PEElMp|@3QCo9A{2vsSotnq19;jFR;3`0+(Cne1`>J702PYi;p-bmyEe-{CN z*Kii@0|o!AbB0jt3~ECMLFO6gxqvA6U43|ozJO*Zb1a>$&H83g(3u%vG0+ZN(iC0* zPdO#c1}qeZLQVvT=e!8cL=1p)Ovt8>1XPuhYE+xG9HdYN3y=_-Jq|}UzkqBJ>6U@~ z520LDNKy??zy&9AKoGi>S9@xZN9nXki8hf26QD&qn6LAu8AXH1R9&!Gsmw)go!u|qr)}`DcqU!=279}>gVG^I?ij7O3ve=q|HcDGN8@K0YOfxIJS*WOXMuA zkU~yrQGt&K!f|=VIycPhbi`WwtKLT#W*_-*hT$~QZKlm$Gq`w!BaEO!!GLDCglVl` zj?0SS?LRYmW<2!D(DkRTrQo6_sQ-&=G&RR_;f)0ZkR!Mo+h4USaJP6kVDI4u3EIJm~b2LehE3sj4jHIF1@jmUIM zmQ9uFA+W4ce9(C;F2iv3<<2K3;cKBrAu~Pm?#_ZMFz23sX}!6{bdIM!?OyF3D0B}L zn+NB(bzl9eZ(qT;Z)Nzle)rrL-TARs^Jibn|MX}1iP!UQ{33r*DEh=XX1&h8*tF2J z#4J7a!6pTxs=+4sOC;>7Rilvv zA4;U-S`-2xcy)O#J>$!&O!IN&B|OD$#zh8aU2|(shTW_|z-mTt$`L$}iW(k^0X7uX zsIfE~0#0d&@*wuLUT}uD)B*d4wN>SqIXh%{O6}%O*-}N-V8a?b1Jy}6E*fmbE+EED zMARmjY#P=NGRxAo{-|9kA(*-c{_5wT#2os@<+AU7+=ObjEU~xfmA;#fdt6s;V9mc} zv3H^O!xvWSZuQ@`-yFH)Ew&96{U=uaV+H?M(LbIW{ig#>P+_WA6E0kk@?)=E6|Tju z#FiSD8$a51W82EBH(GBWEbM&#w)Fd{Pp0nf{3Q7WRX9AH4~*RRp3Gm6a)bGrFocnH z4U60Yw{(8F@A`#X^lC>>p`)i*do(w^-hJdw-5>k!I{t9-UiXP&{cxfC#0oz@yfnSc 
zU!VP}KbY@6kvo~MAKq}F#^&6qb@#S=?(NH4*0*k7n*2@tdOW}H=$+S#t;5BwBXizw
zX~eev{lL+C4aYuy^M-rbu|mCbe&x;FsY1gskiy@%*uK!dbf8$@kvmClLrt-6Pi|zb
zc3Zx+^L9(2^=JXh1&y+TGj6M6%`|q#{r2cvqw_7x
z1G&+A%wF8f?Oo|v*`H@$yvH0_^E84rnjRrYJ;J6Jq#xmLBc?l0dT~RY3H1toZ74W|
zHtd!IFd>~D8y>a5+it4NIs%6%39=$q`I35w5K6waGn~fHd)4$2=Ihi%F>{X}Bg;FP
z4%T%nA&cp_^s@3xSQZ

diff --git a/utils/audio_transcription.py b/utils/audio_transcription.py
new file mode 100644
--- /dev/null
+++ b/utils/audio_transcription.py
@@ -0,0 +1,13 @@
+from pydub import AudioSegment
+from whisper import load_model
+
+def extract_audio_from_video(video_path: str) -> str:
+    audio = AudioSegment.from_file(video_path)
+    audio_path = "/tmp/temp_audio.wav"
+    audio.export(audio_path, format="wav")
+    return audio_path
+
+def transcribe_audio(audio_path: str) -> str:
+    model = load_model("base")
+    result = model.transcribe(audio_path)
+    return result["text"]
diff --git a/utils/image_processing.py b/utils/image_processing.py
new file mode 100644
index 0000000..3ce45e9
--- /dev/null
+++ b/utils/image_processing.py
@@ -0,0 +1,12 @@
+import io
+import base64
+from PIL import Image
+
+def encode_image_base64(image: Image.Image) -> str:
+    """
+    Encode a PIL Image to a Base64 string.
+    """
+    buffered = io.BytesIO()
+    image.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode()
+
diff --git a/utils/video_processing.py b/utils/video_processing.py
new file mode 100644
index 0000000..28c18fb
--- /dev/null
+++ b/utils/video_processing.py
@@ -0,0 +1,93 @@
+import cv2
+import os
+import subprocess
+import numpy as np
+from PIL import Image
+from pydub import AudioSegment
+from decord import VideoReader, cpu
+from concurrent.futures import ThreadPoolExecutor
+
+def split_video_into_segments(video_path, segment_duration=30):
+    """
+    Splits a video into segments of a specified duration using FFmpeg.
+    """
+    output_dir = "/tmp/video_segments"
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Calculate total duration of the video
+    cap = cv2.VideoCapture(video_path)
+    fps = int(cap.get(cv2.CAP_PROP_FPS))
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    total_duration = total_frames / fps
+    cap.release()
+
+    segments = []
+    for start_time in range(0, int(total_duration), segment_duration):
+        segment_file = os.path.join(output_dir, f"segment_{start_time}.mp4")
+        command = [
+            "ffmpeg", "-i", video_path,
+            "-ss", str(start_time),
+            "-t", str(segment_duration),
+            "-c", "copy", segment_file
+        ]
+        subprocess.run(command, check=True)
+        segments.append(segment_file)
+
+    return segments
+
+def extract_motion_key_frames(video_path, max_frames=20, sigma_multiplier=2, frame_interval=1):
+    """
+    Extracts key frames from a video based on motion intensity.
+    """
+    def calculate_motion(frame_pair):
+        """
+        Calculates motion between two consecutive frames using optical flow.
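+        The first element of frame_pair is an already-grayscale reference frame; the second
+        is converted to grayscale here, dense Farneback optical flow is computed between the
+        two, and the motion score is the sum of the squared flow vectors.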
+ """ + prev_gray, current_frame = frame_pair + current_gray = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY) + flow = cv2.calcOpticalFlowFarneback(prev_gray, current_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0) + motion = np.sum(flow ** 2) + return motion, current_gray + + # Load video frames using Decord + video = VideoReader(video_path, ctx=cpu(0)) + frames_batch = video.get_batch(range(0, len(video), frame_interval)).asnumpy() + + # Resize frames for faster processing + frames = [cv2.resize(frame, (frame.shape[1] // 2, frame.shape[0] // 2)) for frame in frames_batch] + + # Initialize the first frame + prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY) + frame_pairs = [(prev_gray, frames[i]) for i in range(1, len(frames))] + + # Calculate motion statistics + motion_values = [] + with ThreadPoolExecutor() as executor: + motion_results = list(executor.map(calculate_motion, frame_pairs)) + motion_values = [motion for motion, _ in motion_results] + + # Calculate threshold statistically + motion_mean = np.mean(motion_values) + motion_std = np.std(motion_values) + threshold = motion_mean + sigma_multiplier * motion_std + + # Extract key frames based on motion threshold + key_frames = [] + for i, (motion, frame) in enumerate(zip(motion_values, frames[1:])): + if motion > threshold and len(key_frames) < max_frames: + img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + key_frames.append(img) + + return key_frames + +def extract_audio_from_video(video_path): + """ + Extract audio from video using pydub and save as a temporary audio file. + """ + print("Audio extraction started...") + audio = AudioSegment.from_file(video_path) + print("Audio extraction completed.") + audio_path = "/tmp/temp_audio.wav" + audio.export(audio_path, format="wav") + print(f"Audio extracted and saved to: {audio_path}") + return audio_path