llm_agent

commit 9400ae904a (parent ad850221c5)
2025-04-15 09:26:24 +08:00
25 changed files with 52,650 additions and 39 deletions

.gitignore (vendored) · 3 changes

@@ -36,7 +36,8 @@ tracking/data/boxes_imgs/*
 tracking/data/trackfeats/*
 tracking/data/tracks/*
 tracking/data/handlocal/*
-contrast/feat_extract/*
+contrast/feat_extract/model/__pycache__/*
+std_img*
 .gitignore
 */__pycache__/*
 ckpts/*


@ -1,15 +1,18 @@
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from accelerate import init_empty_weights, load_checkpoint_in_model
from stream_pipeline import stream_pipeline from stream_pipeline import stream_pipeline
from PIL import Image from PIL import Image
from io import BytesIO from io import BytesIO
import torch import torch
import ast import ast
import requests import requests
import random
# default: Load the model on the available device(s) # default: Load the model on the available device(s)
model = Qwen2VLForConditionalGeneration.from_pretrained( model = Qwen2VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2-VL-7B-Instruct", "Qwen/Qwen2-VL-7B-Instruct",
torch_dtype="auto", torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
device_map="auto" device_map="auto"
) )
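
Note: attn_implementation="flash_attention_2" requires the optional flash-attn package to be installed. A minimal sketch of a guarded load, assuming only transformers and torch are guaranteed to be present (the fallback to "sdpa" is a suggestion, not part of this commit):

import importlib.util
import torch
from transformers import Qwen2VLForConditionalGeneration

# Fall back to PyTorch's built-in SDPA kernels when flash-attn is absent.
attn_impl = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    torch_dtype=torch.bfloat16,
    attn_implementation=attn_impl,
    device_map="auto",
)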
@@ -30,6 +33,7 @@ def qwen_prompt(img_list, messages):
     inputs = inputs.to("cuda")

     # Inference: Generation of the output
-    generated_ids = model.generate(**inputs, max_new_tokens=256)
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, max_new_tokens=256)
     generated_ids_trimmed = [
         out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
@@ -37,16 +41,23 @@ def qwen_prompt(img_list, messages):
     output_text = processor.batch_decode(
         generated_ids_trimmed, add_special_tokens=False, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )
+    del inputs
+    del generated_ids
+    del generated_ids_trimmed
+    torch.cuda.empty_cache()
+
     return output_text[0]

 def get_best_image(track_imgs):
+    if len(track_imgs) >= 5:
+        track_imgs = random.sample(track_imgs, 5)
     img_frames = []
     for i in range(len(track_imgs)):
         content = {}
         content['type'] = 'image'
         content['min_pixels'] = 224 * 224
-        content['max_pixels'] = 1280 * 28 * 28
+        content['max_pixels'] = 800 * 800
         img_frames.append(content)

     messages = [
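
The no_grad wrapper plus the explicit del/empty_cache calls keep per-call GPU memory flat. One way to package that pattern so cleanup also runs when generation raises, as a hedged sketch (free_cuda is an illustrative name, not repo code):

import contextlib
import torch

@contextlib.contextmanager
def free_cuda():
    # Disable autograd inside the block, then release cached CUDA blocks
    # afterwards, even if the body raises.
    try:
        with torch.no_grad():
            yield
    finally:
        torch.cuda.empty_cache()

# usage: with free_cuda(): generated_ids = model.generate(**inputs, max_new_tokens=256)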
@@ -66,6 +77,8 @@ def get_best_image(track_imgs):
     output_text = qwen_prompt(track_imgs, messages)
     output_dict = ast.literal_eval(output_text.strip('```python\n'))
+    if output_dict['index'] > len(track_imgs):
+        output_dict['index'] = len(track_imgs)
     best_img = track_imgs[output_dict['index'] - 1]
     return best_img
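
The added clamp guards against out-of-range indices, but str.strip('```python\n') removes a character set rather than a prefix, and ast.literal_eval can still raise on malformed replies. A defensive sketch (parse_index is an illustrative helper, not part of this commit):

import ast
import re

def parse_index(output_text, n_imgs):
    # Pull the first {...} literal out of the reply, ignoring code fences.
    match = re.search(r"\{.*\}", output_text, re.DOTALL)
    try:
        idx = int(ast.literal_eval(match.group(0))["index"])
    except (AttributeError, KeyError, TypeError, ValueError, SyntaxError):
        idx = 1  # fall back to the first frame
    return min(max(idx, 1), n_imgs)  # clamp to a valid 1-based index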
@@ -74,42 +87,48 @@ def get_product_description(std_img, track_imgs):
     messages = [
         {
             "role": "system",
-            "content": "You are a chatbot working in a supermarket. You now need to extract the product's information and output it in the following python dict format: \
-                {\
-                    'Text': the text extracted from the product, \
-                    'Color': the color of the product, \
-                    'Shape': the shape of the product, \
-                    'Material': the material of the product, \
-                    'Category': the category of the product, \
-                    'is_Same': if at least 3 of the ['Text', 'Color', 'Shape', 'Material', 'Category'] attributes of the two compared products are the same, output True, \
-                    otherwise output False, \
+            "content": "You are a chatbot working in a supermarket. You now need to extract the information of the products in the images and output it in the following python dict format; if \
+                the information is unclear, output 'unknown': \
+                { \
+                    'item1': {\
+                        'Text': the text extracted from the product in the first image, \
+                        'Color': the color of the product in the first image, \
+                        'Shape': the shape of the product in the first image, \
+                        'Material': the material of the product in the first image, \
+                        'Category': the category of the product in the first image, \
+                    } \
+                    'item2': {\
+                        'Text': the text extracted from the product in the second image, \
+                        'Color': the color of the product in the second image, \
+                        'Shape': the shape of the product in the second image, \
+                        'Material': the material of the product in the second image, \
+                        'Category': the category of the product in the second image, \
+                    } \
+                    'is_Same': first check whether 'Color' matches; if it does not, return False; if it does, output True if at least 3 of the ['Text', 'Shape', 'Material', 'Category'] keys of the two dicts above match, \
+                    otherwise output False. \
                 } \
                 "
         },
         {
-            "role": "system",
-            "content": [
-                {
-                    "type": "image",
-                    "min_pixels": 224 * 224,
-                    "max_pixels": 1280 * 28 * 28,
-                },
-            ],
-        },
-        {
-            "role": "system",
-            "content": [
-                {
-                    "type": "image",
-                    "min_pixels": 224 * 224,
-                    "max_pixels": 1280 * 28 * 28,
-                },
-            ],
-        },
-        {
-            "role": "user",
-            "content": "Output the comparison information for the second image as a python dict."
-        }
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "min_pixels": 224 * 224,
+                    "max_pixels": 800 * 800,
+                },
+                {
+                    "type": "image",
+                    "min_pixels": 224 * 224,
+                    "max_pixels": 800 * 800,
+                },
+            ],
+        },
+        # {
+        #     "role": "user",
+        #     "content": "Output the comparison information for the second image as a python dict."
+        #     "content": "Output a list whose contents are the dicts extracted from the two images."
+        # }
     ]

     best_img = get_best_image(track_imgs)
     if std_img is not None:
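
Since the new prompt returns a per-image dict for each product, the is_Same rule can also be verified locally rather than trusting the model's own boolean. A sketch under that assumption (is_same is illustrative, not repo code):

def is_same(item1, item2):
    # Mirror the prompt's rule: 'Color' must match first, then at least 3 of
    # the remaining keys.
    if item1.get('Color') != item2.get('Color'):
        return False
    keys = ['Text', 'Shape', 'Material', 'Category']
    return sum(item1.get(k) == item2.get(k) for k in keys) >= 3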
@@ -124,10 +143,13 @@ def get_product_description(std_img, track_imgs):

 def item_analysis(stream_dict):
     track_imgs = stream_pipeline(stream_dict)
+    if len(track_imgs) == 0:
+        return {}
     std_img = None
     if stream_dict['goodsPic'] is not None:
-        response = requests.get(stream_dict['goodsPic'])
-        std_img = Image.open(BytesIO(response.content))
+        # response = requests.get(stream_dict['goodsPic'])
+        # std_img = Image.open(BytesIO(response.content))
+        std_img = Image.open(stream_dict['goodsPic']).convert("RGB")
     description_dict = get_product_description(std_img, track_imgs)
     return description_dict
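
The commit swaps the requests.get() download for a local Image.open(), so goodsPic is now assumed to be a file path. If both forms must be supported, a small dispatcher could look like this (load_std_img is an illustrative helper, not part of this commit):

from io import BytesIO
from PIL import Image
import requests

def load_std_img(goods_pic):
    # Fetch over HTTP when goodsPic is a URL, otherwise read from disk.
    if goods_pic.startswith(('http://', 'https://')):
        response = requests.get(goods_pic, timeout=10)
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    return Image.open(goods_pic).convert("RGB")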

dataPair_test.ipynb (new file, 23,239 additions)

File diff suppressed because one or more lines are too long

BIN images/20250123160635.jpg (new file, 55 KiB)
BIN images/34414.png (new file, 46 KiB)
BIN images/6917935002150.png (new file, 58 KiB)
BIN (file name not shown) (new file, 416 KiB)
BIN images/6920584471215.png (new file, 35 KiB)
BIN images/6925819700245.jpg (new file, 190 KiB)
BIN images/6931941252224.png (new file, 126 KiB)
BIN images/6934129300472.png (new file, 69 KiB)
BIN images/6942070231936.jpg (new file, 204 KiB)
BIN (file name not shown) (new file, 181 KiB)
BIN images/image1.png (new file, 434 KiB)
BIN images/output.png (new file, 103 KiB)
BIN (file name not shown) (new file, 45 KiB)
BIN (file name not shown) (new file, 321 KiB)
BIN (file name not shown) (new file, 47 KiB)

minicpm.ipynb (new file, 16,326 additions)

File diff suppressed because one or more lines are too long

minicpm.py (new file, 25 additions)

@@ -0,0 +1,25 @@
+# Load model directly
+from transformers import AutoModel, AutoTokenizer
+import torch
+from PIL import Image
+
+model = AutoModel.from_pretrained(
+    "openbmb/MiniCPM-o-2_6",
+    trust_remote_code=True,
+    attn_implementation='flash_attention_2',
+    torch_dtype=torch.bfloat16,
+    # device_map="auto"
+)
+model = model.eval().cuda()
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', use_fast=True, trust_remote_code=True)
+
+img1 = Image.open('/home/ieemoo0337/projects/datasets/constrast_pair/8850813311020/8850813311020.jpg')
+img2 = Image.open('/home/ieemoo0337/projects/datasets/constrast_pair/8850511321499/8850511321499.jpg')
+
+question = 'Describe the first image.'
+msgs = [{'role': 'user', 'content': [img1, img2, question]}]
+
+answer = model.chat(
+    msgs=msgs,
+    tokenizer=tokenizer
+)
+print(answer)
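
For a quick smoke test of the comparison task on MiniCPM, the same chat call can be reused; this sketch reuses model, tokenizer, img1 and img2 from the script above, and the question string is our own wording, not repo code:

question = 'Compare the products in the two images and state whether they are the same.'
msgs = [{'role': 'user', 'content': [img1, img2, question]}]
print(model.chat(msgs=msgs, tokenizer=tokenizer))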

stream_pipeline.py

@@ -37,7 +37,8 @@ def get_optimized_bboxes(event_tracks):
         points.append([int(ele[2]), int(ele[3])])
         labels.append(int(ele[4]))  # track_id
     points = np.array(points)
+    if len(points) == 0:
+        return []
     partitions, indices = tr.partition(points, progress_bar=False, w_perpendicular=100, w_angular=10)

     bboxes_opt = []
@@ -98,6 +99,8 @@ def stream_pipeline(stream_dict):
     event_tracks, video_frames = get_tracking_info(**parmDict)
     bboxes_opt = get_optimized_bboxes(event_tracks[0][1])
+    if len(bboxes_opt) == 0:
+        return []
     subimg_dict = save_event_subimgs(video_frames, bboxes_opt)

     sub_images = []

test.ipynb (new file, 12,995 additions)

File diff suppressed because one or more lines are too long