YOLOv5 Windows环境下的C++部署(GPU)


前言

最近在学习pytorch模型的c++部署,查阅网上资料时发现了很多优秀的博主写的详细的教程,但大部分是以前的版本,所以在此整理记录一下新版的yolov5 c++部署

1、环境介绍

windows环境:

  • win10
  • vs2017
  • libtorch-win-shared-with-deps-1.11.0+cu113
  • opencv 4.2.0
  • cuda11.3
  • cudnn(与cuda11.3对应的版本)

pytorch 1.11.0、yolov5 v6.0

2、环境配置

  打开VS2017,新建一个控制台应用程序c++_test。
在这里插入图片描述
  由于libtorch只能在64位windows上运行,因此需要将项目配置修改为Release x64,后面所有的项目配置都按照Release x64来进行。如需使用调试版的Debug x64,可以按照本教程同样的步骤配置,但需要改用Debug版本的第三方库。
在这里插入图片描述

  在项目中配置opencv和libtorch,依次选择项目、c++_test属性、VC++目录,包含目录中添加:
  D:\opencv\build\include
  D:\opencv\build\include\opencv2
  D:\libtorch\libtorch-win-shared-with-deps-1.11.0+cu113\libtorch\include
  D:\libtorch\libtorch-win-shared-with-deps-1.11.0+cu113\libtorch\include\torch\csrc\api\include

  库目录中添加:
  D:\opencv\build\x64\vc14\lib
  D:\libtorch\libtorch-win-shared-with-deps-1.11.0+cu113\libtorch\lib

  再依次选择链接器、输入,附加依赖项中添加:
  D:\opencv\build\x64\vc14\lib 文件夹下面的opencv_world420.lib
  D:\libtorch\libtorch-win-shared-with-deps-1.11.0+cu113\libtorch\lib 文件夹下面的所有lib文件。
  注意vs的项目属性配置有release和debug两个版本,由于使用的第三方库是release版本的,所以项目的属性配置也需要是release版的

3、.torchscript.pt版本模型导出

  TorchScript是PyTorch模型(nn.Module的子类)的中间表示,可以在C++等高性能环境中运行(不限于C++)。TorchScript可以通过Python使用和导出。导出代码如下所示(注意导出的是GPU版本,GPU版本可以使用GPU和CPU进行推理,而CPU版本仅支持使用CPU进行推理):

"""Exports a YOLOv5 *.pt model to ONNX and TorchScript formats

Usage:
    $ export PYTHONPATH="$PWD" && python models/export.py --weights ./weights/yolov5s.pt --img 640 --batch 1
"""

import argparse
import sys
import time

sys.path.append('./')  # to run '$ python *.py' files in subdirectories

import torch
import torch.nn as nn

import models
from models.experimental import attempt_load
from utils.activations import Hardswish, SiLU
from utils.general import set_logging, check_img_size

if __name__ == '__main__':
    # Command-line options. Note: --dynamic is parsed but never read below; it is kept so
    # that existing command lines keep working.
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default='./last.pt', help='weights path')  # from yolov5/models/
    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='image size')  # height, width
    parser.add_argument('--dynamic', action='store_true', help='dynamic ONNX axes')
    parser.add_argument('--batch-size', type=int, default=1, help='batch size')
    opt = parser.parse_args()
    opt.img_size *= 2 if len(opt.img_size) == 1 else 1  # expand a single value N to [N, N]
    print(opt)
    set_logging()
    t = time.time()

    # Load the FP32 PyTorch model onto the GPU so that the trace is recorded with CUDA tensors
    model = attempt_load(opt.weights, map_location=torch.device('cuda'))

    # Checks
    gs = int(max(model.stride))  # grid size (max stride)
    opt.img_size = [check_img_size(x, gs) for x in opt.img_size]  # verify img_size are gs-multiples

    # Dummy all-zero input on the GPU, shaped (batch, 3, *img_size); used for the dry run and the trace
    img = torch.zeros(opt.batch_size, 3, *opt.img_size).to(device='cuda')
    model.eval()

    # Update model
    for _, m in model.named_modules():
        m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatibility
        if isinstance(m, models.common.Conv):  # assign export-friendly activations
            if isinstance(m.act, nn.Hardswish):
                m.act = Hardswish()
            elif isinstance(m.act, nn.SiLU):
                m.act = SiLU()
        # elif isinstance(m, models.yolo.Detect):
        #     m.forward = m.forward_export  # assign forward (optional)

    # The flag on the last layer is set to False here (an earlier comment claimed True).
    # NOTE(review): presumably this keeps the Detect() layer on its regular inference path so
    # the traced graph returns decoded predictions - confirm against the yolov5 version in use.
    model.model[-1].export = False
    model(img)  # dry run before tracing; the result itself is not needed

    # TorchScript export
    try:
        print('\nStarting TorchScript export with torch %s...' % torch.__version__)
        f = opt.weights.replace('.pt', '.GPU_torchscript.pt')  # filename
        ts = torch.jit.trace(model, img)
        ts.save(f)
        print('TorchScript export success, saved as %s' % f)
    except Exception as e:
        print('TorchScript export failure: %s' % e)

    print('\nExport complete (%.2fs). Visualize with https://github.com/lutzroeder/netron.' % (time.time() - t))

4、c++中调用模型并进行推理

  需要准备的文件:上述导出的 .GPU_torchscript.pt 模型文件和 coco.names。coco.names 是存放标签名称的文件(每行一个类别名称),完整c++代码如下:

#include <torch/script.h>
#include <memory>
#include <torch/torch.h>
#include<opencv2/opencv.hpp>
#include <iostream>
#include <opencv2/core/core.hpp>  
#include <opencv2/highgui/highgui_c.h>
#include <opencv2/highgui/highgui.hpp>  
#include <opencv2/imgproc/imgproc.hpp>
#include <time.h>  
using namespace cv;


/**
 * @brief Reads class labels from a text file, one label per line.
 *
 * Every line becomes one entry, so the entry index equals the zero-based line number.
 * Blank lines are kept on purpose: dropping them would shift the indices of the
 * labels that follow.
 *
 * @param path  Path of the names file (e.g. coco.names).
 * @return The labels in file order; an empty vector if the file could not be opened
 *         (an error message is written to std::cerr in that case).
 */
std::vector<std::string> LoadNames(const std::string& path)
{
	std::vector<std::string> class_names;

	std::ifstream infile(path);
	if (!infile.is_open()) {
		std::cerr << "Error loading the class names!\n";
		return class_names;
	}

	std::string line;
	while (std::getline(infile, line)) {
		// A file saved with CRLF line endings but read without newline translation
		// leaves a trailing '\r' on every label; strip it so labels compare and print cleanly.
		if (!line.empty() && line.back() == '\r') {
			line.pop_back();
		}
		class_names.emplace_back(line);
	}

	// infile is closed by its destructor
	return class_names;
}

/**
 * @brief Resizes src with its aspect ratio preserved and pads it to out_size.
 *
 * The image is scaled by the smaller of the two axis ratios, then surrounded by a
 * constant border of value (114, 114, 114) so the result measures exactly out_size.
 * When the padding along an axis is odd, the extra pixel goes to the bottom / right.
 *
 * @param src       Input image.
 * @param dst       Receives the resized and padded image.
 * @param out_size  Requested output size.
 * @return {left padding, top padding, scale factor}, which is what is needed to map
 *         coordinates in dst back to src.
 */
std::vector<float> LetterboxImage(const cv::Mat& src, cv::Mat& dst, const cv::Size& out_size)
{
	const float src_w = static_cast<float>(src.cols);
	const float src_h = static_cast<float>(src.rows);
	const float target_w = out_size.width;
	const float target_h = out_size.height;

	// one factor for both axes: the tighter of the two ratios, so the image fits inside
	const float ratio_w = target_w / src_w;
	const float ratio_h = target_h / src_h;
	const float scale = std::min(ratio_w, ratio_h);

	// scaled size, truncated to whole pixels
	const cv::Size resized(static_cast<int>(src_w * scale), static_cast<int>(src_h * scale));
	cv::resize(src, dst, resized);

	// leftover space on each axis, split between the two sides
	const int gap_h = static_cast<int>(target_h) - resized.height;
	const int gap_w = static_cast<int>(target_w) - resized.width;
	const int top = gap_h / 2;
	const int down = (gap_h + 1) / 2;
	const int left = gap_w / 2;
	const int right = (gap_w + 1) / 2;

	cv::copyMakeBorder(dst, dst, top, down, left, right, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));

	return std::vector<float>{ static_cast<float>(left), static_cast<float>(top), scale };
}

// Column indices of one row in the n x 6 detection tensor:
// top-left x/y, bottom-right x/y, score, class index.
// Kept as a plain (unscoped) enum because the values are used directly as tensor indices.
enum Det
{
	tl_x,       // 0: top-left x
	tl_y,       // 1: top-left y
	br_x,       // 2: bottom-right x
	br_y,       // 3: bottom-right y
	score,      // 4: confidence score
	class_idx   // 5: class index
};

// One detected object.
// The scalar members carry default initializers so that a default-constructed
// Detection never exposes indeterminate values.
struct Detection
{
	cv::Rect bbox;          // bounding box in pixel coordinates
	float score = 0.0f;     // confidence score of the detection
	int class_idx = 0;      // zero-based index into the class-name list
};

/**
 * @brief Copies boxes and scores out of tensor accessors into OpenCV-friendly vectors.
 *
 * For every row of offset_boxes a cv::Rect spanning the row's top-left and
 * bottom-right corners is appended to offset_box_vec (coordinates are converted
 * to int), and the score from the same row of det is appended to score_vec.
 *
 * @param offset_boxes    Accessor over the box rows; columns are read via Det::tl_x .. Det::br_y.
 * @param det             Accessor over the detection rows; column Det::score is read.
 * @param offset_box_vec  Output: one rectangle per row, appended.
 * @param score_vec       Output: one score per row, appended.
 */
void Tensor2Detection(const at::TensorAccessor<float, 2>& offset_boxes,
	const at::TensorAccessor<float, 2>& det,
	std::vector<cv::Rect>& offset_box_vec,
	std::vector<float>& score_vec)
{
	const auto row_count = offset_boxes.size(0);

	for (int row = 0; row < row_count; ++row) {
		const auto corners = offset_boxes[row];

		// float coordinates are converted to the integer cv::Point type here
		const cv::Point top_left(corners[Det::tl_x], corners[Det::tl_y]);
		const cv::Point bottom_right(corners[Det::br_x], corners[Det::br_y]);

		offset_box_vec.emplace_back(cv::Rect(top_left, bottom_right));
		score_vec.emplace_back(det[row][Det::score]);
	}
}

/**
 * @brief Maps detection boxes from the padded, resized image back to the source image.
 *
 * Each corner has the padding subtracted and is divided by the resize factor; the
 * result is clamped to [0, width] x [0, height] of img_shape and written back into
 * the detection in place.
 *
 * @param data       Detections whose bbox is rewritten in place.
 * @param pad_w      Padding that was added on the left of the image.
 * @param pad_h      Padding that was added on the top of the image.
 * @param scale      Resize factor that was applied to the source image.
 * @param img_shape  Size of the source image, used as the clamping bound.
 */
void ScaleCoordinates(std::vector<Detection>& data, float pad_w, float pad_h,
	float scale, const cv::Size& img_shape)
{
	auto clip = [](float n, float lower, float upper)
	{
		return std::max(lower, std::min(n, upper));
	};

	const float max_x = static_cast<float>(img_shape.width);
	const float max_y = static_cast<float>(img_shape.height);

	for (auto& detection : data) {
		// undo the padding first, then the scaling
		float x1 = (detection.bbox.tl().x - pad_w) / scale;
		float y1 = (detection.bbox.tl().y - pad_h) / scale;
		float x2 = (detection.bbox.br().x - pad_w) / scale;
		float y2 = (detection.bbox.br().y - pad_h) / scale;

		// keep the box inside the source image
		x1 = clip(x1, 0.0f, max_x);
		y1 = clip(y1, 0.0f, max_y);
		x2 = clip(x2, 0.0f, max_x);
		y2 = clip(y2, 0.0f, max_y);

		// float coordinates are converted to the integer cv::Point type here
		detection.bbox = cv::Rect(cv::Point(x1, y1), cv::Point(x2, y2));
	}
}


/**
 * @brief Converts boxes from (center x, center y, width, height) to corner form.
 *
 * Columns 0..3 of x along dimension 1 are read as cx, cy, w, h. The result has the
 * same shape as x; its columns Det::tl_x, Det::tl_y, Det::br_x, Det::br_y hold
 * (x1, y1, x2, y2) and any further columns stay zero.
 *
 * @param x  Input boxes.
 * @return New tensor with the converted boxes.
 */
torch::Tensor xywh2xyxy(const torch::Tensor& x)
{
	const auto center_x = x.select(1, 0);
	const auto center_y = x.select(1, 1);
	const auto half_w = x.select(1, 2).div(2);
	const auto half_h = x.select(1, 3).div(2);

	auto corners = torch::zeros_like(x);
	// select() returns a view, so copy_ writes straight into the matching column of corners
	corners.select(1, Det::tl_x).copy_(center_x - half_w);
	corners.select(1, Det::tl_y).copy_(center_y - half_h);
	corners.select(1, Det::br_x).copy_(center_x + half_w);
	corners.select(1, Det::br_y).copy_(center_y + half_h);
	return corners;
}

/**
 * @brief Turns the raw network output into per-image detection lists.
 *
 * Per image: candidates whose objectness is below conf_thres are dropped, class scores
 * are multiplied by objectness, the best class per candidate is kept, boxes are
 * converted from (cx, cy, w, h) to corner form, class-aware NMS is applied and the
 * surviving boxes are mapped back to source-image pixels with ScaleCoordinates().
 *
 * @param detections  Tensor indexed as [batch, candidate, attribute]. Attributes 0-3 are read
 *                    as the box (cx, cy, w, h), 4 as objectness and 5.. as per-class scores.
 * @param pad_w       Left padding added by the letterbox step.
 * @param pad_h       Top padding added by the letterbox step.
 * @param scale       Resize factor applied by the letterbox step.
 * @param img_shape   Size of the source image.
 * @param conf_thres  Minimum objectness, also used as the score threshold for NMS.
 * @param iou_thres   IoU threshold for NMS.
 * @return One entry per image, in batch order. An image without any detection yields an
 *         empty entry, so result[i] always belongs to batch image i.
 */
std::vector<std::vector<Detection>> PostProcessing(const torch::Tensor& detections,
	float pad_w, float pad_h, float scale, const cv::Size& img_shape,
	float conf_thres, float iou_thres)
{
	// attributes that precede the class scores: 4 box values + 1 objectness
	constexpr int item_attr_size = 5;
	int batch_size = detections.size(0);
	// number of classes, e.g. 80 for coco dataset
	auto num_classes = detections.size(2) - item_attr_size;

	// get candidates which object confidence > threshold
	auto conf_mask = detections.select(2, 4).ge(conf_thres).unsqueeze(2);

	std::vector<std::vector<Detection>> output;
	output.reserve(batch_size);

	// iterating all images in the batch
	for (int batch_i = 0; batch_i < batch_size; batch_i++) {
		// apply constrains to get filtered detections for current image
		auto det = torch::masked_select(detections[batch_i], conf_mask[batch_i]).view({ -1, num_classes + item_attr_size });

		// nothing left for this image: still record an (empty) entry so that the
		// position in the result keeps matching the batch index
		if (0 == det.size(0)) {
			output.emplace_back();
			continue;
		}

		// compute overall score = obj_conf * cls_conf, similar to x[:, 5:] *= x[:, 4:5]
		det.slice(1, item_attr_size, item_attr_size + num_classes) *= det.select(1, 4).unsqueeze(1);

		// box (center x, center y, width, height) to (x1, y1, x2, y2)
		torch::Tensor box = xywh2xyxy(det.slice(1, 0, 4));

		// [best class only] get the max classes score at each result (e.g. elements 5-84)
		std::tuple<torch::Tensor, torch::Tensor> max_classes = torch::max(det.slice(1, item_attr_size, item_attr_size + num_classes), 1);

		// class score
		auto max_conf_score = std::get<0>(max_classes);
		// index
		auto max_conf_index = std::get<1>(max_classes);

		max_conf_score = max_conf_score.to(torch::kFloat).unsqueeze(1);
		max_conf_index = max_conf_index.to(torch::kFloat).unsqueeze(1);

		// shape: n * 6, top-left x/y (0,1), bottom-right x/y (2,3), score(4), class index(5)
		det = torch::cat({ box.slice(1, 0, 4), max_conf_score, max_conf_index }, 1);

		// for batched NMS: shift every box by class_index * max_wh so that boxes of
		// different classes do not overlap and a single NMS pass handles all classes
		constexpr int max_wh = 4096;
		auto c = det.slice(1, Det::class_idx, Det::class_idx + 1) * max_wh;
		auto offset_box = det.slice(1, 0, 4) + c;

		std::vector<cv::Rect> offset_box_vec;
		std::vector<float> score_vec;

		// copy data back to cpu
		auto offset_boxes_cpu = offset_box.cpu();
		auto det_cpu = det.cpu();
		const auto& det_cpu_array = det_cpu.accessor<float, 2>();

		// use accessor to access tensor elements efficiently
		Tensor2Detection(offset_boxes_cpu.accessor<float, 2>(), det_cpu_array, offset_box_vec, score_vec);

		// run NMS on the shifted boxes
		std::vector<int> nms_indices;
		cv::dnn::NMSBoxes(offset_box_vec, score_vec, conf_thres, iou_thres, nms_indices);

		// collect the survivors, using the unshifted boxes
		std::vector<Detection> det_vec;
		for (int index : nms_indices) {
			Detection t;
			const auto& b = det_cpu_array[index];
			t.bbox =
				cv::Rect(cv::Point(b[Det::tl_x], b[Det::tl_y]),
					cv::Point(b[Det::br_x], b[Det::br_y]));
			t.score = det_cpu_array[index][Det::score];
			t.class_idx = det_cpu_array[index][Det::class_idx];
			det_vec.emplace_back(t);
		}

		ScaleCoordinates(det_vec, pad_w, pad_h, scale, img_shape);

		// save final detection for the current image
		output.emplace_back(det_vec);
	} // end of batch iterating

	return output;
}

/**
 * @brief Draws the detections of the first image in the batch onto img.
 *
 * Every detection gets a red rectangle. With label enabled, a filled red tag holding
 * "<class name> <score>" (score with two decimals) is drawn above the box. A class
 * index that has no entry in class_names is shown as its number instead of indexing
 * out of bounds.
 *
 * @param img          Image that is drawn on in place.
 * @param detections   Per-image detection lists; only detections[0] is used.
 * @param class_names  Labels indexed by class index.
 * @param label        Whether to draw the text tag.
 * @return img (a cv::Mat header referring to the same pixel data).
 */
cv::Mat Demo(cv::Mat& img,
	const std::vector<std::vector<Detection>>& detections,
	const std::vector<std::string>& class_names,
	bool label = true)
{
	if (!detections.empty()) {
		for (const auto& detection : detections[0]) {
			const auto& box = detection.bbox;
			float score = detection.score;
			int class_idx = detection.class_idx;

			cv::rectangle(img, box, cv::Scalar(0, 0, 255), 2);

			if (label) {
				std::stringstream ss;
				ss << std::fixed << std::setprecision(2) << score;

				// the names file may hold fewer entries than the model has classes
				const bool known_class = class_idx >= 0
					&& static_cast<std::size_t>(class_idx) < class_names.size();
				const std::string name = known_class ? class_names[class_idx] : std::to_string(class_idx);
				std::string s = name + " " + ss.str();

				auto font_face = cv::FONT_HERSHEY_DUPLEX;
				auto font_scale = 1.0;
				int thickness = 1;
				int baseline = 0;
				auto s_size = cv::getTextSize(s, font_face, font_scale, thickness, &baseline);
				// filled background for the text, placed directly above the box
				cv::rectangle(img,
					cv::Point(box.tl().x, box.tl().y - s_size.height - 5),
					cv::Point(box.tl().x + s_size.width, box.tl().y),
					cv::Scalar(0, 0, 255), -1);
				cv::putText(img, s, cv::Point(box.tl().x, box.tl().y - 5),
					font_face, font_scale, cv::Scalar(255, 255, 255), thickness);
			}
		}
	}
	return img;
}


int main()
{
	torch::DeviceType device_type;
	device_type = torch::kCUDA;
	torch::Device device(device_type);
	torch::jit::script::Module module;
	module = torch::jit::load("D:\\pytorch\\c++_test\\x64\\Debug\\yolov5s.GPU_torchscript.pt", device);  //加载模型
	module.eval();

	std::vector<std::string> class_names = LoadNames("D:\\pytorch\\c++_test\\x64\\Debug\\coco.names");//读取标签
	if (class_names.empty()) {
		return -1;
	}
	// set up threshold
	float conf_thres = 0.4;
	float iou_thres = 0.5;

	VideoCapture video1(0);//打开笔记本自带摄像头(1)为外接摄像头
	video1.set(CAP_PROP_FRAME_WIDTH, 1280);
	video1.set(CAP_PROP_FRAME_HEIGHT, 720);
	//读取视频帧率
	double rate = video1.get(CAP_PROP_FPS);
	std::cout << "rate: " << rate << std::endl;
	//当前视频帧
	Mat frame;
	//每一帧之间的延时
	int delay = 1000 / rate;
	bool stop(false);
	while (!stop)
	{
		double t = (double)cv::getTickCount();//开始计时

		if (!video1.read(frame))
		{
			std::cout << "no video frame" << std::endl;
			break;
		}

		cv::Mat img = frame;
		//inference
		torch::NoGradGuard no_grad;
		cv::Mat img_input = img.clone();
		std::vector<float> pad_info = LetterboxImage(img_input, img_input, cv::Size(640, 640));
		const float pad_w = pad_info[0];
		const float pad_h = pad_info[1];
		const float scale = pad_info[2];
		cv::cvtColor(img_input, img_input, cv::COLOR_BGR2RGB);  // BGR -> RGB
		//归一化需要是浮点类型
		img_input.convertTo(img_input, CV_32FC3, 1.0f / 255.0f);  // normalization 1/255
		// 加载图像到设备
		auto tensor_img = torch::from_blob(img_input.data, { 1, img_input.rows, img_input.cols, img_input.channels() }).to(device_type);
		// BHWC -> BCHW
		tensor_img = tensor_img.permute({ 0, 3, 1, 2 }).contiguous();  // BHWC -> BCHW (Batch, Channel, Height, Width)	
		std::vector<torch::jit::IValue> inputs;
		// 在容器尾部添加一个元素,这个元素原地构造,不需要触发拷贝构造和转移构造
		inputs.emplace_back(tensor_img);	
		//start = clock();
		torch::jit::IValue output = module.forward(inputs);

		// 解析结果
		auto detections = output.toTuple()->elements()[0].toTensor();
		auto result = PostProcessing(detections, pad_w, pad_h, scale, img.size(), conf_thres, iou_thres);

		double endtime = (double)(t_stop - t_start) / CLOCKS_PER_SEC;

		cv::Mat pre_img = Demo(img, result, class_names);
		
		t = ((double)cv::getTickCount() - t) / cv::getTickFrequency();//结束计时
		int fps = int(1.0 / t);//转换为帧率

		std::cout << "FPS: " << fps << std::endl;//输出帧率

		putText(pre_img, ("FPS: " + std::to_string(fps)), Point(0, 50), FONT_HERSHEY_COMPLEX, 0.5, Scalar(0, 0, 0));//输入到帧frame上
		//cv::namedWindow("Result", cv::WINDOW_NORMAL);
		cv::namedWindow("Result", cv::WINDOW_AUTOSIZE);
		cv::imshow("Result", pre_img);

        //waitKey()函数的作用是刷新imshow()展示的图片
		if (waitKey(10) == 27)//27是键盘摁下esc时,计算机接收到的ascii码值
		{
			break;
		}
	}
	video1.release();
	return 0;
}

运行代码结果即为,对摄像头获取到的视频进行目标检测并显示实时帧率。

5、可能遇到的问题

1、“std”:不明确的符号
解决办法:项目 -> 属性 -> C/C++ -> 语言 -> 符合模式 -> 选择“否”
2、模型加载出错
解决办法:查看cuda是否可用,若torch::cuda::is_available()返回false,则在项目属性(Release)-链接器 - 命令行 - 其他选项贴入下面命令

/INCLUDE:"?ignore_this_library_placeholder@@YAHXZ" 

有些博客提到输入以下命令,实测vs2017 torch::cuda::is_available()虽然返回true,但模型加载依然会失败

/INCLUDE:?warp_size@cuda@at@@YAHXZ

3、coco.names读取失败
  解决办法:注意文件后缀是否为.names。也可以直接进入该链接下载.names文件:https://gitee.com/goodtn/libtorch-yolov5-gpu/tree/master

参考:

环境配置参考:https://qianbin.blog.csdn.net/article/details/102937131
代码参考:https://www.cnblogs.com/tensorrt/p/14614632.html

Logo

为开发者提供学习成长、分享交流、生态实践、资源工具等服务,帮助开发者快速成长。

更多推荐