llm_agent

.gitignore (vendored)
@@ -36,7 +36,8 @@ tracking/data/boxes_imgs/*
 tracking/data/trackfeats/*
 tracking/data/tracks/*
 tracking/data/handlocal/*
-contrast/feat_extract/*
+contrast/feat_extract/model/__pycache__/*
+std_img*
 .gitignore
 */__pycache__/*
 ckpts/*
@@ -1,15 +1,18 @@
 from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from accelerate import init_empty_weights, load_checkpoint_in_model
 from stream_pipeline import stream_pipeline
 from PIL import Image
 from io import BytesIO
 import torch
 import ast
 import requests
+import random

 # default: Load the model on the available device(s)
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2-VL-7B-Instruct",
-    torch_dtype="auto",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
     device_map="auto"
 )

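Note on the loader change: moving from torch_dtype="auto" to torch.bfloat16 with attn_implementation="flash_attention_2" reduces memory use and speeds up attention, but it assumes the flash-attn package is installed and the GPU supports bf16. A minimal sketch of a guarded loader (the try/except and the attn_impl variable are illustrative additions, not part of this patch):

    import torch
    from transformers import Qwen2VLForConditionalGeneration

    # Fall back to PyTorch SDPA when flash-attn is not installed,
    # so the same script still loads on machines without it.
    try:
        import flash_attn  # noqa: F401  (presence check only)
        attn_impl = "flash_attention_2"
    except ImportError:
        attn_impl = "sdpa"

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct",
        torch_dtype=torch.bfloat16,
        attn_implementation=attn_impl,
        device_map="auto",
    )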
@@ -30,23 +33,31 @@ def qwen_prompt(img_list, messages):
     inputs = inputs.to("cuda")

     # Inference: Generation of the output
-    generated_ids = model.generate(**inputs, max_new_tokens=256)
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, max_new_tokens=256)
     generated_ids_trimmed = [
         out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
     output_text = processor.batch_decode(
         generated_ids_trimmed, add_special_tokens=False, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )
+    del inputs
+    del generated_ids
+    del generated_ids_trimmed
+    torch.cuda.empty_cache()

     return output_text[0]


 def get_best_image(track_imgs):
+    if len(track_imgs) >= 5:
+        track_imgs = random.sample(track_imgs, 5)
     img_frames = []
     for i in range(len(track_imgs)):
         content = {}
         content['type'] = 'image'
         content['min_pixels'] = 224 * 224
-        content['max_pixels'] = 1280 * 28 * 28
+        content['max_pixels'] = 800 * 800
         img_frames.append(content)

     messages = [
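Note on the generation change: wrapping model.generate in torch.no_grad() and explicitly freeing the intermediate tensors keeps GPU memory flat when qwen_prompt is called repeatedly per event. A hedged alternative sketch (the helper name, torch.inference_mode(), and the try/finally are illustrative, not part of this patch) that guarantees the cleanup even if decoding raises:

    def generate_text(inputs, max_new_tokens=256):
        # inference_mode() is a slightly stricter variant of no_grad().
        try:
            with torch.inference_mode():
                generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
            trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
            return processor.batch_decode(trimmed, skip_special_tokens=True)[0]
        finally:
            del inputs
            torch.cuda.empty_cache()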
@@ -66,6 +77,8 @@ def get_best_image(track_imgs):

     output_text = qwen_prompt(track_imgs, messages)
     output_dict = ast.literal_eval(output_text.strip('```python\n'))
+    if output_dict['index'] > len(track_imgs):
+        output_dict['index'] = len(track_imgs)
     best_img = track_imgs[output_dict['index'] - 1]

     return best_img
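Note on the index handling: the patch clamps output_dict['index'] to the number of images, but ast.literal_eval can still fail on a malformed reply, and an index of 0 or a negative value would silently pick the wrong frame through Python's negative indexing. A hedged sketch of a stricter parse (the function name and the fallback to the first frame are illustrative):

    def parse_best_index(output_text, n_imgs):
        # Strip a possible ```python fence, then evaluate the dict literally.
        cleaned = output_text.strip().strip('`').removeprefix('python').strip()
        try:
            index = int(ast.literal_eval(cleaned).get('index', 1))
        except (ValueError, SyntaxError, AttributeError):
            index = 1  # fall back to the first frame on a malformed reply
        return min(max(index, 1), n_imgs)

    best_img = track_imgs[parse_best_index(output_text, len(track_imgs)) - 1]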
@@ -74,42 +87,48 @@ def get_product_description(std_img, track_imgs):
     messages = [
         {
             "role": "system",
-            "content": "你是一个在超市工作的chatbot,你现在需要提取商品的信息,信息需要按照以下python dict的格式输出: \
-                {\
-                'Text': 商品中提取出的文字信息, \
-                'Color': 商品的颜色, \
-                'Shape': 商品的形状, \
-                'Material': 商品的材质, \
-                'Category': 商品的类别, \
-                'is_Same': 如果比对的两件商品的['Text', 'Color', 'Shape', 'Material', 'Category']属性中至少有3个相同则输出True,\
-                否则输出False, \
+            "content": "你是一个在超市工作的chatbot,你现在需要提取图像中商品的信息,信息需要按照以下python dict的格式输出,如果 \
+                信息模糊不清则输出'未知': \
+                { \
+                'item1': {\
+                'Text': 第一张图像中商品中提取出的文字信息, \
+                'Color': 第一张图像中商品的颜色, \
+                'Shape': 第一张图像中商品的形状, \
+                'Material': 第一张图像中商品的材质, \
+                'Category': 第一张图像中商品的类别, \
+                } \
+                'item2': {\
+                'Text': 第二张图像中商品中提取出的文字信息, \
+                'Color': 第二张图像中商品的颜色, \
+                'Shape': 第二张图像中商品的形状, \
+                'Material': 第二张图像中商品的材质, \
+                'Category': 第二张图像中商品的类别, \
+                } \
+                'is_Same': 首先判断'Color'是否一致,如果不一致则返回False,如果一致则判断是否以上两个dict的['Text', 'Shape', 'Material', 'Category']key中至少有3个相同则输出True,\
+                否则输出False。 \
                 } \
                 "
         },
-        {
-            "role": "system",
-            "content": [
-                {
-                    "type": "image",
-                    "min_pixels": 224 * 224,
-                    "max_pixels": 1280 * 28 * 28,
-                },
-            ],
-        },
-        {
-            "role": "system",
-            "content": [
-                {
-                    "type": "image",
-                    "min_pixels": 224 * 224,
-                    "max_pixels": 1280 * 28 * 28,
-                },
-            ],
-        },
         {
             "role": "user",
-            "content": "以python dict的形式输出第二张图像的比对信息。"
-        }
+            "content": [
+                {
+                    "type": "image",
+                    "min_pixels": 224 * 224,
+                    "max_pixels": 800 * 800,
+                },
+                {
+                    "type": "image",
+                    "min_pixels": 224 * 224,
+                    "max_pixels": 800 * 800,
+                },
+            ],
+        },
+        # {
+        #     "role": "user",
+        #     "content": "以python dict的形式输出第二张图像的比对信息。"
+        #     "content": "输出一个list,list的内容包含两张图像提取出的dict信息。"
+        # }
     ]
     best_img = get_best_image(track_imgs)
     if std_img is not None:
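Note on the prompt rewrite: the system message now asks for one dict per image ('item1', 'item2') plus a single 'is_Same' verdict that checks 'Color' first and then requires at least three of 'Text', 'Shape', 'Material', 'Category' to match, and the two image placeholders move from separate system turns into a single user turn. A hedged sketch of the reply shape this prompt expects and a tolerant way to read the verdict (the example values and the helper are illustrative):

    # Shape of the reply the new system prompt asks for:
    example_reply = {
        'item1': {'Text': '...', 'Color': '红色', 'Shape': '瓶装', 'Material': '塑料', 'Category': '饮料'},
        'item2': {'Text': '未知', 'Color': '红色', 'Shape': '瓶装', 'Material': '塑料', 'Category': '饮料'},
        'is_Same': True,
    }

    def is_same(reply_dict):
        # Treat anything other than an explicit True / 'True' as "not the same item".
        value = reply_dict.get('is_Same', False)
        return value is True or str(value).strip().lower() == 'true'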
@@ -124,10 +143,13 @@ def get_product_description(std_img, track_imgs):

 def item_analysis(stream_dict):
     track_imgs = stream_pipeline(stream_dict)
+    if len(track_imgs) == 0:
+        return {}
     std_img = None
     if stream_dict['goodsPic'] is not None:
-        response = requests.get(stream_dict['goodsPic'])
-        std_img = Image.open(BytesIO(response.content))
+        # response = requests.get(stream_dict['goodsPic'])
+        # std_img = Image.open(BytesIO(response.content))
+        std_img = Image.open(stream_dict['goodsPic']).convert("RGB")
     description_dict = get_product_description(std_img, track_imgs)

     return description_dict
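Note on the goodsPic change: the HTTP download via requests is commented out and stream_dict['goodsPic'] is now treated as a local file path. If the field can still carry a URL in some deployments, a hedged sketch that accepts either form (the helper name and the URL check are illustrative, not part of this patch):

    def load_goods_pic(goods_pic):
        # Accept either an http(s) URL or a local file path.
        if goods_pic.startswith(('http://', 'https://')):
            response = requests.get(goods_pic, timeout=10)
            response.raise_for_status()
            return Image.open(BytesIO(response.content)).convert("RGB")
        return Image.open(goods_pic).convert("RGB")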
New files in this change:
 dataPair_test.ipynb (new file, 23239 lines added)
 images/20250123160635.jpg (new binary file, 55 KiB)
 images/34414.png (new binary file, 46 KiB)
 images/6917935002150.png (new binary file, 58 KiB)
 images/6917935002150_std.png (new binary file, 416 KiB)
 images/6920584471215.png (new binary file, 35 KiB)
 images/6925819700245.jpg (new binary file, 190 KiB)
 images/6931941252224.png (new binary file, 126 KiB)
 images/6934129300472.png (new binary file, 69 KiB)
 images/6942070231936.jpg (new binary file, 204 KiB)
 images/carton_tw_asw_竹炭深潔_770.png (new binary file, 181 KiB)
 images/image1.png (new binary file, 434 KiB)
 images/output.png (new binary file, 103 KiB)
 images/pair1/20250211100406.jpg (new binary file, 45 KiB)
 images/pair1/6924743915886.jpg (new binary file, 321 KiB)
 images/pair2/6903244682954.jpg (new binary file, 47 KiB)
 minicpm.ipynb (new file, 16326 lines added)

minicpm.py (new file, 25 lines added)
@@ -0,0 +1,25 @@
+# Load model directly
+from transformers import AutoModel, AutoTokenizer
+import torch
+from PIL import Image
+
+model = AutoModel.from_pretrained(
+    "openbmb/MiniCPM-o-2_6",
+    trust_remote_code=True,
+    attn_implementation='flash_attention_2',
+    torch_dtype=torch.bfloat16,
+    # device_map="auto"
+)
+model = model.eval().cuda()
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', use_fast=True, trust_remote_code=True)
+
+img1 = Image.open('/home/ieemoo0337/projects/datasets/constrast_pair/8850813311020/8850813311020.jpg')
+img2 = Image.open('/home/ieemoo0337/projects/datasets/constrast_pair/8850511321499/8850511321499.jpg')
+
+question = '描述第一张图像的1。'
+msgs = [{'role': 'user', 'content': [img1, img2, question]}]
+answer = model.chat(
+    msgs=msgs,
+    tokenizer=tokenizer
+)
+print(answer)
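Note on minicpm.py: the two images are passed to model.chat at full resolution, while the Qwen2-VL path now budgets each image at max_pixels = 800 * 800. A hedged sketch of an equivalent cap applied on the PIL side before calling chat (the helper is illustrative and uses only plain Pillow; no MiniCPM-specific preprocessing options are assumed):

    def cap_resolution(img, max_pixels=800 * 800):
        # Downscale so width * height stays within the pixel budget.
        w, h = img.size
        if w * h > max_pixels:
            scale = (max_pixels / (w * h)) ** 0.5
            img = img.resize((max(1, int(w * scale)), max(1, int(h * scale))), Image.BILINEAR)
        return img

    img1 = cap_resolution(img1)
    img2 = cap_resolution(img2)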
@@ -37,7 +37,8 @@ def get_optimized_bboxes(event_tracks):
         points.append([int(ele[2]), int(ele[3])])
         labels.append(int(ele[4]))  # track_id
     points = np.array(points)
+    if len(points) == 0:
+        return []
     partitions, indices = tr.partition(points, progress_bar=False, w_perpendicular=100, w_angular=10)

     bboxes_opt = []
@@ -98,6 +99,8 @@ def stream_pipeline(stream_dict):

     event_tracks, video_frames = get_tracking_info(**parmDict)
     bboxes_opt = get_optimized_bboxes(event_tracks[0][1])
+    if len(bboxes_opt) == 0:
+        return []
     subimg_dict = save_event_subimgs(video_frames, bboxes_opt)

     sub_images = []
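Note on the pipeline guards: the patch returns [] from get_optimized_bboxes when no trajectory points were collected and from stream_pipeline when no optimized boxes come back, which is what lets item_analysis return {} cleanly for empty events. Indexing event_tracks[0][1] is still unguarded; a hedged sketch of the same pattern applied there (illustrative, not part of this patch):

    event_tracks, video_frames = get_tracking_info(**parmDict)
    if len(event_tracks) == 0:
        return []
    bboxes_opt = get_optimized_bboxes(event_tracks[0][1])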