masterUpdate

This commit is contained in:
2022-12-29 13:45:07 +08:00
parent b84a92f67a
commit 615c91feb6
2 changed files with 18 additions and 6 deletions

View File

@ -15,6 +15,7 @@ sys.path.insert(0, ".")
import logging.config import logging.config
from skywalking import agent, config from skywalking import agent, config
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
SW_SERVER = os.environ.get('SW_AGENT_COLLECTOR_BACKEND_SERVICES') SW_SERVER = os.environ.get('SW_AGENT_COLLECTOR_BACKEND_SERVICES')
SW_SERVICE_NAME = os.environ.get('SW_AGENT_NAME') SW_SERVICE_NAME = os.environ.get('SW_AGENT_NAME')
if SW_SERVER and SW_SERVICE_NAME: if SW_SERVER and SW_SERVICE_NAME:
@ -96,7 +97,7 @@ class Predictor(object):
probs = torch.nn.Softmax(dim=-1)(part_logits) probs = torch.nn.Softmax(dim=-1)(part_logits)
topN = torch.argsort(probs, dim=-1, descending=True).tolist() topN = torch.argsort(probs, dim=-1, descending=True).tolist()
clas_ids = topN[0][0] clas_ids = topN[0][0]
clas_ids = 0 if 0==int(clas_ids) or 2 == int(clas_ids) or 3 == int(clas_ids) else 1 #clas_ids = 0 if 0==int(clas_ids) or 2 == int(clas_ids) or 3 == int(clas_ids) else 1
#print("cur_img result: class id: %d, score: %0.3f" % (clas_ids, probs[0, clas_ids].item())) #print("cur_img result: class id: %d, score: %0.3f" % (clas_ids, probs[0, clas_ids].item()))
result["success"] = "true" result["success"] = "true"
result["rst_cls"] = str(clas_ids) result["rst_cls"] = str(clas_ids)

View File

@ -63,6 +63,15 @@ def save_model(args, model):
torch.save(checkpoint, model_checkpoint) torch.save(checkpoint, model_checkpoint)
logger.info("Saved model checkpoint to [DIR: %s]", args.output_dir) logger.info("Saved model checkpoint to [DIR: %s]", args.output_dir)
def save_eve_model(args, model, eve_name):
    """Write a named snapshot of the model weights into ``args.output_dir``.

    The snapshot file is called ``<eve_name>_checkpoint.bin`` and contains a
    dict with a single ``'model'`` entry holding the state dict.

    Args:
        args: Object exposing ``output_dir``, the directory to write into.
        model: Object whose ``state_dict()`` is saved. If it has a ``module``
            attribute, that inner object's state dict is saved instead.
        eve_name: Label substituted into the checkpoint file name.
    """
    # Unwrap when a ``module`` attribute is present
    # (presumably a DataParallel/DDP wrapper -- TODO confirm against caller).
    if hasattr(model, 'module'):
        weights_owner = model.module
    else:
        weights_owner = model

    snapshot_path = os.path.join(args.output_dir, "%s_checkpoint.bin" % eve_name)
    torch.save({'model': weights_owner.state_dict()}, snapshot_path)

    # The log line reports the output directory, not the full file path.
    logger.info("Saved model checkpoint to [DIR: %s]", args.output_dir)
def setup(args): def setup(args):
# Prepare model # Prepare model
@ -265,6 +274,8 @@ def train(args, model):
save_model(args, model) save_model(args, model)
best_acc = accuracy best_acc = accuracy
logger.info("best accuracy so far: %f" % best_acc) logger.info("best accuracy so far: %f" % best_acc)
if int(global_step)%10000 == 0:
save_eve_model(args, model, str(global_step))
model.train() model.train()
if global_step % t_total == 0: if global_step % t_total == 0:
@ -295,7 +306,7 @@ def main():
help="Name of this run. Used for monitoring.") help="Name of this run. Used for monitoring.")
parser.add_argument("--dataset", choices=["CUB_200_2011", "car", "dog", "nabirds", "INat2017", "emptyJudge5", "emptyJudge4"], parser.add_argument("--dataset", choices=["CUB_200_2011", "car", "dog", "nabirds", "INat2017", "emptyJudge5", "emptyJudge4"],
default="CUB_200_2011", help="Which dataset.") default="CUB_200_2011", help="Which dataset.")
parser.add_argument('--data_root', type=str, default='/data/fineGrained') parser.add_argument('--data_root', type=str, default='/data/pfc/fineGrained')
parser.add_argument("--model_type", choices=["ViT-B_16", "ViT-B_32", "ViT-L_16", "ViT-L_32", "ViT-H_14"], parser.add_argument("--model_type", choices=["ViT-B_16", "ViT-B_32", "ViT-L_16", "ViT-L_32", "ViT-H_14"],
default="ViT-B_16",help="Which variant to use.") default="ViT-B_16",help="Which variant to use.")
parser.add_argument("--pretrained_dir", type=str, default="ckpts/ViT-B_16.npz", parser.add_argument("--pretrained_dir", type=str, default="ckpts/ViT-B_16.npz",
@ -309,7 +320,7 @@ def main():
help="Total batch size for training.") help="Total batch size for training.")
parser.add_argument("--eval_batch_size", default=16, type=int, parser.add_argument("--eval_batch_size", default=16, type=int,
help="Total batch size for eval.") help="Total batch size for eval.")
parser.add_argument("--eval_every", default=200, type=int, parser.add_argument("--eval_every", default=200, type=int, #200
help="Run prediction on validation set every so many steps." help="Run prediction on validation set every so many steps."
"Will always run one evaluation at the end of training.") "Will always run one evaluation at the end of training.")
@ -317,7 +328,7 @@ def main():
help="The initial learning rate for SGD.") help="The initial learning rate for SGD.")
parser.add_argument("--weight_decay", default=0, type=float, parser.add_argument("--weight_decay", default=0, type=float,
help="Weight deay if we apply some.") help="Weight deay if we apply some.")
parser.add_argument("--num_steps", default=8000, type=int, #100000 parser.add_argument("--num_steps", default=40000, type=int, #100000
help="Total number of training epochs to perform.") help="Total number of training epochs to perform.")
parser.add_argument("--decay_type", choices=["cosine", "linear"], default="cosine", parser.add_argument("--decay_type", choices=["cosine", "linear"], default="cosine",
help="How to decay the learning rate.") help="How to decay the learning rate.")
@ -352,9 +363,9 @@ def main():
# Setup CUDA, GPU & distributed training # Setup CUDA, GPU & distributed training
if args.local_rank == -1: if args.local_rank == -1:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('torch.cuda.device_count()>>>>>>>>>>>>>>>>>>>>>>>>>', torch.cuda.device_count()) #print('torch.cuda.device_count()>>>>>>>>>>>>>>>>>>>>>>>>>', torch.cuda.device_count())
args.n_gpu = torch.cuda.device_count() args.n_gpu = torch.cuda.device_count()
print('torch.cuda.device_count()>>>>>>>>>>>>>>>>>>>>>>>>>', torch.cuda.device_count()) #print('torch.cuda.device_count()>>>>>>>>>>>>>>>>>>>>>>>>>', torch.cuda.device_count())
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch.cuda.set_device(args.local_rank) torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank) device = torch.device("cuda", args.local_rank)