From 7fb9de49b73f423a3bb88aefd9b67685ef79bcd3 Mon Sep 17 00:00:00 2001 From: Zixiao Wang Date: Mon, 12 May 2025 11:22:46 +0800 Subject: [PATCH] updated readme --- README.md | 52 +++++++++- __pycache__/main.cpython-311.pyc | Bin 2107 -> 2116 bytes __pycache__/pipeline_setup.cpython-311.pyc | Bin 719 -> 728 bytes __pycache__/tasks.cpython-311.pyc | Bin 8597 -> 8351 bytes celery_debug.py | 40 ++++---- endpoints/image.py | 8 +- endpoints/video.py | 3 +- prompt.txt | 11 +++ tasks.py | 19 +--- test_audio.py | 92 ------------------ .../audio_transcription.cpython-311.pyc | Bin 2917 -> 2926 bytes .../image_processing.cpython-311.pyc | Bin 837 -> 846 bytes .../video_processing.cpython-311.pyc | Bin 7650 -> 7659 bytes 13 files changed, 88 insertions(+), 137 deletions(-) create mode 100644 prompt.txt delete mode 100644 test_audio.py diff --git a/README.md b/README.md index a1df0ab..daf7ae6 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,52 @@ -# api +# API Usage Guide +This project supports multiple methods to start and interact with the API. + +## Method 1: Standard API Start + +Start the API using the main script: + +``` +python main.py +``` + +## Method 2: Gradio UI + +To use the Gradio UI: + +1. In the `endpoints/` directory, change the comment to enable 'gradio'. +2. Run the UI script: + +``` +python ui.py +``` + +## Method 3: Celery Optimization + +Use Celery for optimized background task processing. You can adjust the parameters for concurrency as needed. + +### Step 1: Start Redis Server + +``` +redis-server +``` + +### Step 2: Start Celery Workers + +Open two separate terminals and run the following: + +**Terminal 1 (Preprocessing Queue):** +``` +celery -A tasks worker --pool=threads --loglevel=info --concurrency=2 --queues=preprocess_queue +``` + +**Terminal 2 (Inference Queue):** +``` +celery -A tasks worker --pool=threads --loglevel=info --concurrency=3 --queues=inference_queue +``` + +### Step 3: Run Debug Script + +``` +python celery_debug.py +``` diff --git a/__pycache__/main.cpython-311.pyc b/__pycache__/main.cpython-311.pyc index 8a40201a668f8b5bc2e2038f93958aff1ee2e16d..65bc157354ce05ba9a714e733a388e24aa8be28d 100644 GIT binary patch delta 60 zcmdlja72J-IWI340}xz`Qq0)MbBjq)Lq8)wH&s7BKQm9ixI{lBGrJ@|TQ?*zCpE7` O*E26IfAc>kTUG!q^ApSf delta 51 zcmX>iuv>sLI3~& diff --git a/__pycache__/tasks.cpython-311.pyc b/__pycache__/tasks.cpython-311.pyc index fdc9f6102289325535f0539eea55fdbd9a6a0ff4..0a92ce6951496e9ea94288b5f66e92ac74a75775 100644 GIT binary patch delta 1721 zcmaJ>O>7%g5Z>p%*l}zpvE$hBe+x~V)OM2)1QE2PX(eb;RN4rMjnLNhJ~x}}A8&V| z;XwJx1x}5gDsey}r}WSYRQb|FFTjZtoRHW9C%7PiC>)S-VrE?z7gs!K=gpfp-^`nt z&%U|(`AYm&EH*5_IC6-O+t{RIEOmkbJ2*oyDntB^RqKuTns?~^?Lf+T~OR|v=i$kESyk9ke@yl84agPG?AdF;;k0TG1T@O77*q}jGlPVEK`z;L@~7R0F@ zYDthM22g35^^gI1asZj@AukGY=6{gc0c76mUO_CtKBX45l33EpVp*$*6|b{7l~?@L zQEgV7g>z|F5a)Wi<^;JskgKvQ{HLnpJ^z{@9~tmJI^bWE$t9usROe@ZgYX}M53ajG zlPS9G#;Gow2545@rq^tifzDNZ3@791mJhUY`LNp9G&I^Uz_nr7jdf+)HnwZ)O_l0) z?G?RkH0-9eZPm>kH-Ukqw;alLBs8)-gm?o!>l}#pg~*#94|%E!77|D zca|f65>nx{Xy(EkLZ3!)48&uxysK%vngP)t3XbmR+low${*r27Ry#MN_xxm$uO-qX z$7y1L{Kh{`%#s5CCb5vms#pWzerOUqfya3e)c`xq$41lBXW(E3Ky=>>iW@VT(V~_m zc?sFeXya?6KaxUcX6yj#_-_1o=rovFnv-OjWO*j}{3-P7D=fsi9*B;#6_rZ%j!9i2 zF&sd*!VqYf*X!1eWIA*S3=f3PouuoJK7zAzd~edpzX*3gic_$;Wa}X%SPqVCg>PhM z$b0;JW`3!FhE)*nj;}bfVwBfeQ@2`7G416t!*UPFup2;nRDH0+qnOC=Wre0^$hxvxP8O99y}B&qa{VsA=q z*3_uQuHaP|Gz;i#u@=Uh;a^Rqa@aXHq3CVOsNSLvr>%ii?ffzIE%Ddcd44}H@?xPh z{VL+FB4S8Zm=2-cq2?9KY}%V{SdzT?N-WFe!pRBbwfA#lNPdPaWo!I?A^%bxRb-<_ zL^tc$ifYxHI3~${G$ts)#5yrkH>I8Nj>a(Cn7O;N^C`?br^q znJr=9XHj@mL@qHbr#nuM7g+OQ_yx_79hIJBAHfm-yse*n{FCAa|GoG%ImYjns{HTL Nm!!^bm46K&?jPo-o*n=I delta 1944 zcmah}O>7%Q6yDig8`tqq>m>e9Y=>0EO=3HtDJ?=3Ay8VWm{J5tTdA()orHC~-p%Y9 zC8f|H<(5se5L&IL__s3KSFXO%;I4A zUy*oiPz}(*dbGnO2$fiOrUuoJM!<%_?)*sHhKhGl5KVOF>aJDc8aSVJ1A>yIgO?My zzez!jeAa)Bz$(0*M;)N4IZ+*4mH&~I?q*F3YP2USMpJI4tj2pXhq{@E1vT*>naQ5a zVYlNMB?Dusk=1ibP9IUYN6M$&+sHKb>SM~7KCX;|pIsG{iEgbiLCy8l8tLgpp^Kjo z)JJ>pqdoYdN?sQ7yW8JLMUtwC-LAnXt-ts8!sJ6<8pxbGE1V@~z=I62d?qxbs#LZOf+j>lwL)6R2dEgKY72L#IW<(3(ZdF523HZ7ft~jutV|DwxZT zR8>`$1oUZmEu5?fzsWmp`jbD(sh{N3O*ysJ7dY=^>|D{I-1WC__OxA=*?; z;I^%TO_ywzfnZ)_Lom4W^=&XzkP}8oWwAxsas|t?97HL9!FxhvsrD7GMq(~$c#~Aq ztMinN@|-VP96?f-c@){aj9RUR?GS2tz|Z2-sk76~;y3(*?KgdQ$hqCHT+YkR7Tz~B zl^U#wjCnNJ=Dde(mT;HGaA_P&PX#xB>ro~FwS>q;Fe}2{fY-Bc%~!k$|Lx!XSEcw9 z5dTDU#6roDG<|-fG5l(1iag{$hxU@x`{AiHc7Qz&#*rGd$@ZZ;4JI$M!@L}h?l}UE zC4<>KaL{B%japWvbF5>qc$>c){)MF4bmuifcJn}F+ zWgP@WogH&pgI4TilRBir$_RA(;Ds61-DtJR==S2T0YW70KO&CQx53BC{QGDtJp*f? zj#(HSQr~C+xa>Jzi9JT%vgZ^U0RkkXM`7G+4nj0SZcsTob1!WScLIkyi)Wg1Pu?SYJP-rteATnCLi z9#(CF8jreMs#devew=!%rU{B=r@&cnCx0g~yY2KpB$Ch0A=szGJ%OezHji+(seLeW zd)hJUAB)zUql`9dbp2LQSZ}o_lHU=j#9rior4*0Vvb6?3o*wZYN3sRJl+ME4UP>QI z;&<$-+WpAb1iGkEc7p$zPCq?`Q&gENFeiaTR-xKbHKSs~F|8WYOj|RWuJ)XeshPB) zHEG47cFSb@`H`XYs|O(8i9j1`Ee1Qa3v(^I1r1~ek@!h8uEbCh47cC8DEE5t2jPM5 zDcw+84SI;32baKW$rAs1=%oKwA-LlHt?|rol9z_BlM-JW&L>yncSVuxx(gcHbx#sR Ue>dRc4~M@X2l(fiTQXMo8%*!Se*gdg diff --git a/celery_debug.py b/celery_debug.py index 6050ce0..35a4641 100644 --- a/celery_debug.py +++ b/celery_debug.py @@ -41,7 +41,7 @@ print("Broker:", app.conf.broker_url) # Define the number of concurrent tasks NUM_TASKS = 1 delay_seconds = 0 -file_paths = [f"../video/film4.mp4" for _ in range(NUM_TASKS)] +file_paths = [f"../video/3.mp4" for _ in range(NUM_TASKS)] # video_folder = "../video" @@ -71,29 +71,29 @@ file_paths = [f"../video/film4.mp4" for _ in range(NUM_TASKS)] # for i in range(NUM_TASKS) # ] -# questions = [ -# f"Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields:\n" -# f"- Product Name: \n" -# f"- Category: \n" -# f"- Styles or Variants: \n" -# f"- Highlights: \n" -# f"- Promotional Details: \n" -# f"Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist.' Task {i}" -# for i in range(NUM_TASKS) -# ] - questions = [ - f"Generate a screenplay based on the given video content and format the output as JSON with the following structured fields:\n" - f"- Scene Descriptions: \n" - f"- Character Introductions: \n" - f"- Dialogue: \n" - f"- Actions & Expressions: \n" - f"- Product Integrations: \n" - f"- Narrative Flow: \n" - f"Follow standard screenplay formatting for headers, character names, dialogue, and actions. Do not include disclaimers or comments like 'I can't assist.' Task {i}" + f"Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields:\n" + f"- Product Name: \n" + f"- Category: \n" + f"- Styles or Variants: \n" + f"- Highlights: \n" + f"- Promotional Details: \n" + f"Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist.' Task {i}" for i in range(NUM_TASKS) ] +# questions = [ +# f"Generate a screenplay based on the given video content and format the output as JSON with the following structured fields:\n" +# f"- Scene Descriptions: \n" +# f"- Character Introductions: \n" +# f"- Dialogue: \n" +# f"- Actions & Expressions: \n" +# f"- Product Integrations: \n" +# f"- Narrative Flow: \n" +# f"Follow standard screenplay formatting for headers, character names, dialogue, and actions. Do not include disclaimers or comments like 'I can't assist.' Task {i}" +# for i in range(NUM_TASKS) +# ] + # def submit_task(question, index): # sends tasks to Celery asynchronously, queues the tasks in Celery broker. If multiple Celery workers, they process tasks in parallel. # """ Submits a Celery task with increasing delay """ # countdown_time = index * delay_seconds # Dynamic delay diff --git a/endpoints/image.py b/endpoints/image.py index 1f03968..a89a572 100644 --- a/endpoints/image.py +++ b/endpoints/image.py @@ -8,6 +8,7 @@ from PIL import Image from pipeline_setup import pipe, IMAGE_TOKEN from utils.image_processing import encode_image_base64 +# api async def image_query(file: UploadFile, question: str = Form(...)): """ API endpoint to process an image with the user's query. @@ -26,11 +27,8 @@ async def image_query(file: UploadFile, question: str = Form(...)): except Exception as e: return JSONResponse({"query": question, "error": str(e)}) - +# gradio # async def image_query(image: np.ndarray, question: str): -# """ -# API endpoint to process an image (as numpy array) with the user's query. -# """ # try: # # Convert the numpy array to a PIL Image # image = Image.fromarray(image).convert("RGB").resize((512, 512)) @@ -49,7 +47,7 @@ async def image_query(file: UploadFile, question: str = Form(...)): # except Exception as e: # return {"query": question, "error": str(e)} - +# celery # def image_query(image_path: str, question: str): # try: # print("image_path in image_query...") diff --git a/endpoints/video.py b/endpoints/video.py index 7188f8f..85cb103 100644 --- a/endpoints/video.py +++ b/endpoints/video.py @@ -24,6 +24,7 @@ def load_checkpoint(video_id): return json.load(f) return None +# api # async def video_query(file: UploadFile, question: str = Form(...)): # try: # print("Processing video...") @@ -131,7 +132,7 @@ def load_checkpoint(video_id): # except Exception as e: # return JSONResponse({"query": question, "error": str(e)}) - +# gradio # async def video_query(video_path: str, question: str): # """ # API endpoint to process a video file with the user's query. diff --git a/prompt.txt b/prompt.txt new file mode 100644 index 0000000..8165018 --- /dev/null +++ b/prompt.txt @@ -0,0 +1,11 @@ +图片 +"Extract the following information from this image and return the result in JSON format:\n" "- Name: \n" "- ID: \n" "- Profile Picture: \n" "- Follower Count: \n" "- Likes Count: \n" "- Bio: \n" "- Following Count: \n" "- External Links: \n" # "Provide no additional text other than the extracted information." "Do not include any disclaimers or comments like 'I'm sorry' or 'I can't assist'." + + +视频 +“Based on the given images and audio script, extract detailed information about the products recommended in the video and format the output as JSON with the following fields: +1. **Product Name**: The specific name of the product, if mentioned. +2. **Category**: The specific category of the product (e.g., electronics, skincare, casual wear, etc.). +3. **Styles or Variants**: Any styles, designs, or variants of the product described (e.g., colors, patterns, sizes, or other distinguishing attributes). +4. **Highlights**: The unique selling points or notable features emphasized by the anchor (e.g., benefits, quality, or standout features). +5. **Promotional Details**: Any additional promotional information mentioned, such as discounts, offers, or key features that set the product apart.” \ No newline at end of file diff --git a/tasks.py b/tasks.py index 96590dc..a035aa0 100644 --- a/tasks.py +++ b/tasks.py @@ -6,8 +6,6 @@ mp.set_start_method("spawn", force=True) # mp.set_start_method("fork", force=True) from celery import Celery - -import psutil from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetUtilizationRates # from endpoints.video import run_video_inference @@ -54,11 +52,6 @@ app.conf.worker_prefetch_multiplier = 1 # include=["tasks"] # ) -# app.conf.task_routes = { -# 'tasks.preprocess_video': {'queue': 'preprocess_queue'}, -# 'tasks.inference_video': {'queue': 'inference_queue'}, -# } - app.conf.task_routes = { 'tasks.preprocess_video': {'queue': 'preprocess_queue'}, 'tasks.inference_video': {'queue': 'inference_queue'}, @@ -93,11 +86,6 @@ app.conf.task_routes = { # celery.conf.task_time_limit = 60 # 60 seconds max execution time # celery.conf.task_soft_time_limit = 50 # Warn at 50 seconds -@app.task -def add(x, y): - print("Adding task...") - return x + y - @app.task(name="tasks.text_query_task") def text_query_task(question: str): print("Importing text_query...") @@ -139,11 +127,6 @@ def video_query_task(file_path: str, question: str): # def video_query_task(preprocessed_data): # return run_video_inference(preprocessed_data) - -# @celery.task(name="tasks.test_task") -# def test_task(): -# return "Celery is working!" - import mimetypes @@ -342,7 +325,7 @@ def preprocess_video(video_path, question): # for preprocessed_data in preprocessed_results: # video_path = preprocessed_data["video_path"] # question = preprocessed_data["question"] -# segments = preprocessed_data["processed_segments"] +# segments = preprocessed_data["processed_data"] # print(f"Inferencing video: {video_path}") diff --git a/test_audio.py b/test_audio.py deleted file mode 100644 index 9ca5572..0000000 --- a/test_audio.py +++ /dev/null @@ -1,92 +0,0 @@ -import torch -import os -from whisper import load_model -from pydub import AudioSegment - -def extract_audio_from_video(video_path: str) -> str: - audio = AudioSegment.from_file(video_path) - audio_path = "/tmp/temp_audio_test.wav" - audio.export(audio_path, format="wav") - print("video extracted!") - return audio_path - -# def transcribe_audio(audio_path: str) -> str: -# print("Loading model in transcribe_audio...") -# from transformers import WhisperProcessor, WhisperForConditionalGeneration -# import torch - -# # Load processor and model from transformers -# processor = WhisperProcessor.from_pretrained("openai/whisper-base") -# model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base") - -# if torch.cuda.device_count() > 1: -# print(f"Using {torch.cuda.device_count()} GPUs!") -# model = torch.nn.DataParallel(model) - -# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -# model.to(device) -# print("Model loaded successfully.") -# print(audio_path) - -# # Load and process the audio file -# import librosa -# audio_input, sr = librosa.load(audio_path, sr=16000) -# input_features = processor(audio_input, sampling_rate=sr, return_tensors="pt").input_features.to(device) - -# # Generate transcription -# with torch.no_grad(): -# if isinstance(model, torch.nn.DataParallel): -# generated_ids = model.module.generate(input_features) -# else: -# generated_ids = model.generate(input_features) - -# # Decode the generated tokens to text -# transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] -# return transcription - -def transcribe_audio(audio_path: str) -> str: - print("Loading model in transcribe_audio...") - os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" - - from transformers import WhisperProcessor, WhisperForConditionalGeneration - import torch - - # Load processor and model from transformers - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - - if torch.cuda.device_count() > 1: - print(f"Using {torch.cuda.device_count()} GPUs!") - model = torch.nn.DataParallel(model) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model.to(device) - print("Model loaded successfully.") - print(audio_path) - - # Load and process the audio file - import librosa - audio_input, sr = librosa.load(audio_path, sr=16000) - input_features = processor(audio_input, sampling_rate=sr, return_tensors="pt").input_features.to(device) - - # Generate transcription - with torch.no_grad(): - if isinstance(model, torch.nn.DataParallel): - generated_ids = model.module.generate(input_features) - else: - generated_ids = model.generate(input_features) - - # Decode the generated tokens to text - transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] - return transcription - -if __name__ == "__main__": - extract_audio_from_video("../video/1.mp4") - - audio_file = "/tmp/temp_audio_test.wav" - - for i in range(3): - print(f"\nTranscription attempt {i + 1}:") - transcription = transcribe_audio(audio_file) - print("Transcription:") - print(transcription) \ No newline at end of file diff --git a/utils/__pycache__/audio_transcription.cpython-311.pyc b/utils/__pycache__/audio_transcription.cpython-311.pyc index bfa52b66659479918c81079e537cb53a4e5fdb1b..24f9da0c3e0534d490fbd2e0c8ad73f7e404120e 100644 GIT binary patch delta 60 zcmaDV_D+mvIWI340}xz`Qq0)Mlf|fLtDljdo2s9mpP8p$T%sS6nO%~fts9bzS99zqya`Iwt@$f)o7! delta 51 zcmaDS_Ed~#IWI340}zzIyOzF@CyP