# Image preprocessing
def preprocess_batch(images, inputs, dst_width=640, dst_height=640):
    """Preprocess a batch of images and load it into the first input buffer.

    Each image is passed through ``preprocess_warpAffine``, converted with
    ``ToTensor``, given a leading batch axis and cast to ``float32``. The
    per-image arrays are concatenated along axis 0, flattened, and assigned
    to ``inputs[0].host``.

    Args:
        images: Iterable of images accepted by ``preprocess_warpAffine``.
        inputs: Sequence whose first element exposes a writable ``host``
            attribute (presumably the engine's input binding -- confirm
            against the buffer-allocation code).
        dst_width: Target width forwarded to ``preprocess_warpAffine``.
        dst_height: Target height forwarded to ``preprocess_warpAffine``.

    Returns:
        A list holding, for each image in input order, the first value
        returned by ``preprocess_warpAffine``. ``inference_batch`` forwards
        these values to ``postprocess``.

    Raises:
        ValueError: If ``images`` yields no image at all.
    """
    transforms = []
    warped_images = []
    for image in images:
        transform, warped = preprocess_warpAffine(image, dst_width, dst_height)
        transforms.append(transform)
        warped_images.append(warped)

    # Fail early with a clear message instead of letting np.concatenate
    # complain about an empty sequence; the host buffer is left untouched.
    if not warped_images:
        raise ValueError("preprocess_batch() requires at least one image")

    # The converter is built once and reused for every image.
    to_tensor = ToTensor()
    batch_input = np.concatenate(
        [
            to_tensor(warped)[None].cpu().numpy().astype(np.float32)
            for warped in warped_images
        ],
        axis=0,
    )
    inputs[0].host = batch_input.ravel()
    return transforms
# Model inference
def do_inference_v2(context, bindings, inputs, outputs, stream, input_tensor):
    """Run one asynchronous inference pass and return the host output buffers.

    The steps are all queued on ``stream``: copy every input from host to
    device, launch ``context.execute_async_v2``, copy every output from device
    to host, then block until the stream has finished.

    Args:
        context: Object providing ``execute_async_v2(bindings=...,
            stream_handle=...)`` (presumably a TensorRT execution context --
            confirm against the caller).
        bindings: Binding list forwarded unchanged to ``execute_async_v2``.
        inputs: Iterable of buffers exposing ``host`` and ``device``.
        outputs: Iterable of buffers exposing ``host`` and ``device``.
        stream: Stream object exposing ``handle`` and ``synchronize()``.
        input_tensor: Unused; kept so existing callers keep working.

    Returns:
        A list with the ``host`` attribute of every entry in ``outputs``.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)

    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)

    # Wait for all queued copies and the inference launch to finish before
    # the host buffers are read.
    stream.synchronize()

    # Return only the host outputs.
    return [out.host for out in outputs]
def inference_batch(engine, context, inputs, outputs, bindings, stream, images, batch_size):
    """Preprocess, infer and postprocess one batch, printing stage timings.

    Three stages run in order and the elapsed time of each is printed with the
    prefixes ``s1`` (preprocessing), ``s2`` (inference) and ``s3``
    (postprocessing).

    Args:
        engine: Unused here; kept so existing callers keep working.
        context: Forwarded to ``do_inference_v2``.
        inputs: Input buffers; filled by ``preprocess_batch`` and forwarded to
            ``do_inference_v2``.
        outputs: Output buffers forwarded to ``do_inference_v2``.
        bindings: Binding list forwarded to ``do_inference_v2``.
        stream: Stream forwarded to ``do_inference_v2``.
        images: Images forwarded to ``preprocess_batch``.
        batch_size: Number of per-image slices taken from the first output;
            it must not exceed the number of images, because one
            preprocessing result is indexed per slice.

    Returns:
        A list with one ``postprocess`` result per batch index.
    """
    # Length of one image's slice in the flattened first output. Presumably
    # 84 values for each of 8400 candidates from a 640x640 YOLO-style head --
    # confirm against the engine's output shape.
    per_image_len = 84 * 8400

    # perf_counter() is monotonic, so the printed intervals cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    IMs = preprocess_batch(images, inputs)
    print("s1 " + str(time.perf_counter() - started))

    started = time.perf_counter()
    outs = do_inference_v2(
        context,
        bindings=bindings,
        inputs=inputs,
        outputs=outputs,
        stream=stream,
        input_tensor=inputs,
    )
    print("s2 " + str(time.perf_counter() - started))

    started = time.perf_counter()
    results = [
        postprocess(
            [outs[0][i * per_image_len:(i + 1) * per_image_len]],
            IMs[i],
            conf_thres=0.25,
            iou_thres=0.45,
        )
        for i in range(batch_size)
    ]
    print("s3 " + str(time.perf_counter() - started))
    return results
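# Data preprocessing (C++/CUDA): adds CUDA preprocessing code for batched images. Only the changed parts are shown; refer to the previous article for the rest.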
void preprocess_batch(const std::vector<:mat>& imgBatch, float* dstDevData, const int dstHeight, const int dstWidth, cudaStream_t stream)
{
if(imgBatch.size() == 0){
return;
}
int dstElements = dstHeight * dstWidth * 3;
for(int i=0;i
int srcHeight = imgBatch[i].rows;
int srcWidth = imgBatch[i].cols;
int srcElements = srcHeight * srcWidth * 3;
// middle image data on device ( for bilinear resize )
uchar* midDevData;
cudaMalloc((void**)&midDevData, sizeof(uchar) * dstElements);
// source images data on device
uchar* srcDevData;
cudaMalloc((void**)&srcDevData, sizeof(uchar) * srcElements);
cudaMemcpyAsync(srcDevData, imgBatch[i].data, sizeof(uchar) * srcElements, cudaMemcpyHostToDevice, stream);
// calculate width and height after resize
int w, h, x, y;
float r_w = dstWidth / (srcWidth * 1.0);
float r_h = dstHeight / (srcHeight * 1.0);
if (r_h > r_w) {
w = dstWidth;
h = r_w * srcHeight;
x = 0;
y = (dstHeight - h) / 2;
}
else {
w = r_h * srcWidth;
h = dstHeight;
x = (dstWidth - w) / 2;
y = 0;
}
dim3 blockSize(32, 32);
dim3 gridSize((dstWidth + blockSize.x - 1) / blockSize.x, (dstHeight + blockSize.y - 1) / blockSize.y);