llm_agent

.gitignore (vendored)
@@ -36,7 +36,8 @@ tracking/data/boxes_imgs/*
 tracking/data/trackfeats/*
 tracking/data/tracks/*
 tracking/data/handlocal/*
-contrast/feat_extract/*
+contrast/feat_extract/model/__pycache__/*
+std_img*
 .gitignore
 */__pycache__/*
 ckpts/*
@@ -1,15 +1,18 @@
 from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from accelerate import init_empty_weights, load_checkpoint_in_model
 from stream_pipeline import stream_pipeline
 from PIL import Image
 from io import BytesIO
 import torch
 import ast
 import requests
+import random

 # default: Load the model on the available device(s)
 model = Qwen2VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2-VL-7B-Instruct",
-    torch_dtype="auto",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
     device_map="auto"
 )

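Note on the loader change: moving from torch_dtype="auto" to torch.bfloat16 with attn_implementation="flash_attention_2" reduces memory use and speeds up attention, but it assumes the flash-attn package is installed and the GPU supports bf16. A minimal sketch of a guarded loader (the try/except and the attn_impl variable are illustrative additions, not part of this patch):

    import torch
    from transformers import Qwen2VLForConditionalGeneration

    # Fall back to PyTorch SDPA when flash-attn is not installed,
    # so the same script still loads on machines without it.
    try:
        import flash_attn  # noqa: F401  (presence check only)
        attn_impl = "flash_attention_2"
    except ImportError:
        attn_impl = "sdpa"

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct",
        torch_dtype=torch.bfloat16,
        attn_implementation=attn_impl,
        device_map="auto",
    )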
@@ -30,23 +33,31 @@ def qwen_prompt(img_list, messages):
     inputs = inputs.to("cuda")

     # Inference: Generation of the output
-    generated_ids = model.generate(**inputs, max_new_tokens=256)
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, max_new_tokens=256)
     generated_ids_trimmed = [
         out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
     output_text = processor.batch_decode(
         generated_ids_trimmed, add_special_tokens=False, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )
+    del inputs
+    del generated_ids
+    del generated_ids_trimmed
+    torch.cuda.empty_cache()

     return output_text[0]


 def get_best_image(track_imgs):
+    if len(track_imgs) >= 5:
+        track_imgs = random.sample(track_imgs, 5)
     img_frames = []
     for i in range(len(track_imgs)):
         content = {}
         content['type'] = 'image'
         content['min_pixels'] = 224 * 224
-        content['max_pixels'] = 1280 * 28 * 28
+        content['max_pixels'] = 800 * 800
         img_frames.append(content)

     messages = [
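Note on the generation change: wrapping model.generate in torch.no_grad() and explicitly freeing the intermediate tensors keeps GPU memory flat when qwen_prompt is called repeatedly per event. A hedged alternative sketch (the helper name, torch.inference_mode(), and the try/finally are illustrative, not part of this patch) that guarantees the cleanup even if decoding raises:

    def generate_text(inputs, max_new_tokens=256):
        # inference_mode() is a slightly stricter variant of no_grad().
        try:
            with torch.inference_mode():
                generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
            trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
            return processor.batch_decode(trimmed, skip_special_tokens=True)[0]
        finally:
            del inputs
            torch.cuda.empty_cache()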
@@ -66,6 +77,8 @@ def get_best_image(track_imgs):

     output_text = qwen_prompt(track_imgs, messages)
     output_dict = ast.literal_eval(output_text.strip('```python\n'))
+    if output_dict['index'] > len(track_imgs):
+        output_dict['index'] = len(track_imgs)
     best_img = track_imgs[output_dict['index'] - 1]

     return best_img
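Note on the index handling: the patch clamps output_dict['index'] to the number of images, but ast.literal_eval can still fail on a malformed reply, and an index of 0 or a negative value would silently pick the wrong frame through Python's negative indexing. A hedged sketch of a stricter parse (the function name and the fallback to the first frame are illustrative):

    def parse_best_index(output_text, n_imgs):
        # Strip a possible ```python fence, then evaluate the dict literally.
        cleaned = output_text.strip().strip('`').removeprefix('python').strip()
        try:
            index = int(ast.literal_eval(cleaned).get('index', 1))
        except (ValueError, SyntaxError, AttributeError):
            index = 1  # fall back to the first frame on a malformed reply
        return min(max(index, 1), n_imgs)

    best_img = track_imgs[parse_best_index(output_text, len(track_imgs)) - 1]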
@@ -74,42 +87,48 @@ def get_product_description(std_img, track_imgs):
     messages = [
         {
             "role": "system",
-            "content": "你是一个在超市工作的chatbot,你现在需要提取商品的信息,信息需要按照以下python dict的格式输出: \
-                {\
-                'Text': 商品中提取出的文字信息, \
-                'Color': 商品的颜色, \
-                'Shape': 商品的形状, \
-                'Material': 商品的材质, \
-                'Category': 商品的类别, \
-                'is_Same': 如果比对的两件商品的['Text', 'Color', 'Shape', 'Material', 'Category']属性中至少有3个相同则输出True,\
-                否则输出False, \
+            "content": "你是一个在超市工作的chatbot,你现在需要提取图像中商品的信息,信息需要按照以下python dict的格式输出,如果 \
+                信息模糊不清则输出'未知': \
+                { \
+                'item1': {\
+                'Text': 第一张图像中商品中提取出的文字信息, \
+                'Color': 第一张图像中商品的颜色, \
+                'Shape': 第一张图像中商品的形状, \
+                'Material': 第一张图像中商品的材质, \
+                'Category': 第一张图像中商品的类别, \
+                } \
+                'item2': {\
+                'Text': 第二张图像中商品中提取出的文字信息, \
+                'Color': 第二张图像中商品的颜色, \
+                'Shape': 第二张图像中商品的形状, \
+                'Material': 第二张图像中商品的材质, \
+                'Category': 第二张图像中商品的类别, \
+                } \
+                'is_Same': 首先判断'Color'是否一致,如果不一致则返回False,如果一致则判断是否以上两个dict的['Text', 'Shape', 'Material', 'Category']key中至少有3个相同则输出True,\
+                否则输出False。 \
                 } \
                 "
         },
-        {
-            "role": "system",
-            "content": [
-                {
-                    "type": "image",
-                    "min_pixels": 224 * 224,
-                    "max_pixels": 1280 * 28 * 28,
-                },
-            ],
-        },
-        {
-            "role": "system",
-            "content": [
-                {
-                    "type": "image",
-                    "min_pixels": 224 * 224,
-                    "max_pixels": 1280 * 28 * 28,
-                },
-            ],
-        },
         {
             "role": "user",
-            "content": "以python dict的形式输出第二张图像的比对信息。"
-        }
+            "content": [
+                {
+                    "type": "image",
+                    "min_pixels": 224 * 224,
+                    "max_pixels": 800 * 800,
+                },
+                {
+                    "type": "image",
+                    "min_pixels": 224 * 224,
+                    "max_pixels": 800 * 800,
+                },
+            ],
+        },
+        # {
+        #     "role": "user",
+        #     "content": "以python dict的形式输出第二张图像的比对信息。"
+        #     "content": "输出一个list,list的内容包含两张图像提取出的dict信息。"
+        # }
     ]
     best_img = get_best_image(track_imgs)
     if std_img is not None:
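Note on the prompt rewrite: the system message now asks for one dict per image ('item1', 'item2') plus a single 'is_Same' verdict that checks 'Color' first and then requires at least three of 'Text', 'Shape', 'Material', 'Category' to match, and the two image placeholders move from separate system turns into a single user turn. A hedged sketch of the reply shape this prompt expects and a tolerant way to read the verdict (the example values and the helper are illustrative):

    # Shape of the reply the new system prompt asks for:
    example_reply = {
        'item1': {'Text': '...', 'Color': '红色', 'Shape': '瓶装', 'Material': '塑料', 'Category': '饮料'},
        'item2': {'Text': '未知', 'Color': '红色', 'Shape': '瓶装', 'Material': '塑料', 'Category': '饮料'},
        'is_Same': True,
    }

    def is_same(reply_dict):
        # Treat anything other than an explicit True / 'True' as "not the same item".
        value = reply_dict.get('is_Same', False)
        return value is True or str(value).strip().lower() == 'true'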
@@ -124,10 +143,13 @@ def get_product_description(std_img, track_imgs):

 def item_analysis(stream_dict):
     track_imgs = stream_pipeline(stream_dict)
+    if len(track_imgs) == 0:
+        return {}
     std_img = None
     if stream_dict['goodsPic'] is not None:
-        response = requests.get(stream_dict['goodsPic'])
-        std_img = Image.open(BytesIO(response.content))
+        # response = requests.get(stream_dict['goodsPic'])
+        # std_img = Image.open(BytesIO(response.content))
+        std_img = Image.open(stream_dict['goodsPic']).convert("RGB")
     description_dict = get_product_description(std_img, track_imgs)

     return description_dict
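Note on the goodsPic change: the HTTP download via requests is commented out and stream_dict['goodsPic'] is now treated as a local file path. If the field can still carry a URL in some deployments, a hedged sketch that accepts either form (the helper name and the URL check are illustrative, not part of this patch):

    def load_goods_pic(goods_pic):
        # Accept either an http(s) URL or a local file path.
        if goods_pic.startswith(('http://', 'https://')):
            response = requests.get(goods_pic, timeout=10)
            response.raise_for_status()
            return Image.open(BytesIO(response.content)).convert("RGB")
        return Image.open(goods_pic).convert("RGB")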
New files in this change:
 dataPair_test.ipynb (new file, 23239 lines added)
 images/20250123160635.jpg (new binary file, 55 KiB)
 images/34414.png (new binary file, 46 KiB)
 images/6917935002150.png (new binary file, 58 KiB)
 images/6917935002150_std.png (new binary file, 416 KiB)
 images/6920584471215.png (new binary file, 35 KiB)
 images/6925819700245.jpg (new binary file, 190 KiB)
 images/6931941252224.png (new binary file, 126 KiB)
 images/6934129300472.png (new binary file, 69 KiB)
 images/6942070231936.jpg (new binary file, 204 KiB)
 images/carton_tw_asw_竹炭深潔_770.png (new binary file, 181 KiB)
 images/image1.png (new binary file, 434 KiB)
 images/output.png (new binary file, 103 KiB)
 images/pair1/20250211100406.jpg (new binary file, 45 KiB)
 images/pair1/6924743915886.jpg (new binary file, 321 KiB)
 images/pair2/6903244682954.jpg (new binary file, 47 KiB)
 minicpm.ipynb (new file, 16326 lines added)

minicpm.py (new file, 25 lines added)
@@ -0,0 +1,25 @@
+# Load model directly
+from transformers import AutoModel, AutoTokenizer
+import torch
+from PIL import Image
+
+model = AutoModel.from_pretrained(
+    "openbmb/MiniCPM-o-2_6",
+    trust_remote_code=True,
+    attn_implementation='flash_attention_2',
+    torch_dtype=torch.bfloat16,
+    # device_map="auto"
+)
+model = model.eval().cuda()
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', use_fast=True, trust_remote_code=True)
+
+img1 = Image.open('/home/ieemoo0337/projects/datasets/constrast_pair/8850813311020/8850813311020.jpg')
+img2 = Image.open('/home/ieemoo0337/projects/datasets/constrast_pair/8850511321499/8850511321499.jpg')
+
+question = '描述第一张图像的1。'
+msgs = [{'role': 'user', 'content': [img1, img2, question]}]
+answer = model.chat(
+    msgs=msgs,
+    tokenizer=tokenizer
+)
+print(answer)
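Note on minicpm.py: the two images are passed to model.chat at full resolution, while the Qwen2-VL path now budgets each image at max_pixels = 800 * 800. A hedged sketch of an equivalent cap applied on the PIL side before calling chat (the helper is illustrative and uses only plain Pillow; no MiniCPM-specific preprocessing options are assumed):

    def cap_resolution(img, max_pixels=800 * 800):
        # Downscale so width * height stays within the pixel budget.
        w, h = img.size
        if w * h > max_pixels:
            scale = (max_pixels / (w * h)) ** 0.5
            img = img.resize((max(1, int(w * scale)), max(1, int(h * scale))), Image.BILINEAR)
        return img

    img1 = cap_resolution(img1)
    img2 = cap_resolution(img2)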
@@ -37,7 +37,8 @@ def get_optimized_bboxes(event_tracks):
         points.append([int(ele[2]), int(ele[3])])
         labels.append(int(ele[4]))  # track_id
     points = np.array(points)
+    if len(points) == 0:
+        return []
     partitions, indices = tr.partition(points, progress_bar=False, w_perpendicular=100, w_angular=10)

     bboxes_opt = []
@@ -98,6 +99,8 @@ def stream_pipeline(stream_dict):

     event_tracks, video_frames = get_tracking_info(**parmDict)
     bboxes_opt = get_optimized_bboxes(event_tracks[0][1])
+    if len(bboxes_opt) == 0:
+        return []
     subimg_dict = save_event_subimgs(video_frames, bboxes_opt)

     sub_images = []
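Note on the pipeline guards: the patch returns [] from get_optimized_bboxes when no trajectory points were collected and from stream_pipeline when no optimized boxes come back, which is what lets item_analysis return {} cleanly for empty events. Indexing event_tracks[0][1] is still unguarded; a hedged sketch of the same pattern applied there (illustrative, not part of this patch):

    event_tracks, video_frames = get_tracking_info(**parmDict)
    if len(event_tracks) == 0:
        return []
    bboxes_opt = get_optimized_bboxes(event_tracks[0][1])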