Unity MiniCPM-V 让引擎拥有视觉

前言
项目
- Python环境布置
- Unity场景布置
- 代码编写
- 添加并设置脚本
- 总结
鸣谢
AI提示

前言

MiniCPM-V刚刚发布，我忍不住想玩一下：它可以让之前制作的语音助手拥有一定的视觉能力（不是说OpenCV不行，而是AI更符合一个助手所需要的观察力）。这个简单的小项目中我只实现了少量交互代码，其余大部分由ChatGPT 3.5完成，可以在文末链接查看对话记录。

AI视觉识别

项目

Python环境布置

参考官网配置: https://github.com/OpenBMB/MiniCPM-V/tree/main

除了官网的配置还要安装一下flask库

pip install flask

将下面的脚本放到MiniCPM-V仓库的根目录下（脚本需要导入仓库中的chat模块），并运行

from flask import Flask, request, jsonify
from chat import MiniCPMVChat, img2base64
import torch
import json
import base64
from io import BytesIO
from PIL import Image

# Create the Flask application object; the /analyze route below is registered on it.
app = Flask(__name__)

# Seed PyTorch's global RNG with a fixed value before the model is constructed
# (presumably to make generation repeatable between runs - confirm against chat.py).
torch.manual_seed(0)
# Construct the chat wrapper once at import time; every request reuses this instance.
chat_model = MiniCPMVChat('openbmb/MiniCPM-Llama3-V-2_5')

def pil_image_to_base64(img):
    """Return ``img`` encoded as PNG and wrapped in a base64 text string.

    ``img`` only needs a ``save(file_obj, format=...)`` method; the PNG bytes
    are written to an in-memory stream, never to disk.
    """
    with BytesIO() as png_stream:
        img.save(png_stream, format="PNG")
        png_bytes = png_stream.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode()

@app.route('/analyze', methods=['POST'])
def analyze_image():
    """Answer a question about an image posted as JSON.

    Expected request body: a JSON object with two string fields,
    ``image`` (base64-encoded image bytes) and ``question``.

    Returns:
        200 with ``{"answer": ..., "context": [...]}`` on success, where
        ``context`` is the single user message that was sent to the model.
        400 with ``{"error": ...}`` when the body is not a JSON object, a
        field is missing or not a string, or the image cannot be decoded.
        500 with ``{"error": ...}`` for any other failure.
    """
    try:
        # silent=True yields None instead of raising on a missing/invalid JSON
        # body, so a malformed request is reported as 400 rather than 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'error': 'Request body must be a JSON object'}), 400

        image_base64 = data.get('image')
        question = data.get('question')

        if not image_base64 or not question:
            return jsonify({'error': 'Missing image or question'}), 400
        if not isinstance(image_base64, str) or not isinstance(question, str):
            return jsonify({'error': 'image and question must be strings'}), 400

        # Debug: log a summary only; printing the whole base64 image floods the console.
        print(f"Received question: {question!r}, image payload: {len(image_base64)} base64 chars")

        try:
            # Decode the upload and re-encode it as PNG so the model always
            # receives the same image format regardless of what was posted.
            image_data = base64.b64decode(image_base64)
            image = Image.open(BytesIO(image_data))
            im_64 = pil_image_to_base64(image)
        except (ValueError, OSError) as e:
            # binascii.Error is a ValueError; Pillow's "cannot identify image"
            # error is an OSError. Both mean the client sent bad data.
            return jsonify({'error': f'Invalid image data: {e}'}), 400

        # Prepare the inputs for the model: the conversation is passed as a JSON string.
        msgs = [{"role": "user", "content": question}]
        inputs = {"image": im_64, "question": json.dumps(msgs)}

        # Get the answer from the model
        answer = chat_model.chat(inputs)

        return jsonify({
            'answer': answer,
            'context': msgs
        })

    except Exception as e:
        print(f"Error: {str(e)}")  # Debug: Log any error that occurs
        return jsonify({'error': str(e)}), 500

# Start Flask's built-in server only when this file is executed as a script.
# The server listens on 127.0.0.1 (loopback), port 5000.
if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000)

配置环境并运行Python脚本

Unity场景布置

代码编写

创建并挂载ImageAnalyzer.cs脚本

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.Networking;
using System.Text;
using Newtonsoft.Json;
using UnityEngine.UI;

/// <summary>
/// JSON payload deserialized from the analysis server's reply.
/// </summary>
public class Response
{
    /// <summary>Value of the "answer" JSON field.</summary>
    [JsonProperty("answer")]
    public string Answer { get; set; }

    /// <summary>Value of the "context" JSON field: a list of role/content entries.</summary>
    [JsonProperty("context")]
    public List<ContextItem> Context { get; set; }
}

/// <summary>
/// One entry of the "context" list in the server reply.
/// </summary>
public class ContextItem
{
    /// <summary>Value of the "role" JSON field.</summary>
    [JsonProperty("role")]
    public string Role { get; set; }

    /// <summary>Value of the "content" JSON field.</summary>
    [JsonProperty("content")]
    public string Content { get; set; }
}
/// <summary>
/// Shows the webcam feed, and on request (button click or the Q key) captures one frame,
/// posts it together with a question to the analysis server as JSON, and displays the
/// returned answer in <see cref="tipText"/>.
/// </summary>
/// <remarks>
/// <see cref="tipText"/>, <see cref="clickButton"/> and <see cref="rawImage"/> are used without
/// null checks and must be assigned in the Inspector.
/// </remarks>
public class ImageAnalyzer : MonoBehaviour
{
    // Question sent by both built-in triggers (button and Q key).
    private const string DefaultQuestion = "用简短易懂的语言告诉我这张图上有什么?";

    [SerializeField] private string serverUrl = "http://your_server_ip:5000/analyze";
    [SerializeField] private int webcamWidth = 1280;
    [SerializeField] private int webcamHeight = 720;
    [SerializeField] private int webcamFPS = 30;
    [SerializeField] private WebCamTexture webCamTexture;

    private Texture2D snap;

    // True while a request started by the button or the Q key is in flight.
    // Prevents overlapping coroutines from destroying `snap` under each other.
    private bool isAnalyzing;

    public Text tipText;
    public Button clickButton;
    public RawImage rawImage;

    /// <summary>Starts the webcam and puts the UI into its idle state.</summary>
    private void Start()
    {
        // Pass the requested FPS as well; previously webcamFPS was configurable but ignored.
        webCamTexture = new WebCamTexture(webcamWidth, webcamHeight, webcamFPS);
        webCamTexture.Play();

        tipText.text = "请点击按钮询问";
        clickButton.interactable = true;
        rawImage.texture = webCamTexture;
    }

    /// <summary>Keyboard shortcut: pressing Q triggers the same analysis as the button.</summary>
    private void Update()
    {
        if (Input.GetKeyDown(KeyCode.Q))
        {
            Debug.Log("按下了按钮");
            BeginAnalysis();
        }
    }

    /// <summary>Button handler: starts an analysis of the current webcam frame.</summary>
    public void ClickButtonFunction()
    {
        BeginAnalysis();
    }

    /// <summary>
    /// Locks the UI and starts the capture/analysis coroutine, unless one is already running.
    /// </summary>
    private void BeginAnalysis()
    {
        if (isAnalyzing)
        {
            return;
        }

        isAnalyzing = true;
        tipText.text = "请等待。。。";
        clickButton.interactable = false;
        StartCoroutine(AnalyzeImageFromWebcam(DefaultQuestion));
    }

    /// <summary>
    /// Shows <paramref name="message"/> to the user, re-enables the button and clears the in-flight flag.
    /// </summary>
    private void FinishAnalysis(string message)
    {
        tipText.text = message;
        clickButton.interactable = true;
        isAnalyzing = false;
    }

    /// <summary>
    /// Waits for a fresh webcam frame, copies it into <c>snap</c>, and sends it to the server.
    /// </summary>
    /// <param name="question">Question to ask about the captured frame.</param>
    public IEnumerator AnalyzeImageFromWebcam(string question)
    {
        // Wait until the webcam has delivered a new frame this Unity frame.
        yield return new WaitUntil(() => webCamTexture.didUpdateThisFrame);

        // Release the previous snapshot before allocating a new one.
        if (snap != null)
        {
            Destroy(snap);
        }

        // Copy the current frame; the 32-bit variants avoid the float Color conversion.
        snap = new Texture2D(webCamTexture.width, webCamTexture.height);
        snap.SetPixels32(webCamTexture.GetPixels32());
        snap.Apply();

        string base64Image = ConvertTextureToBase64(snap);

        yield return StartCoroutine(AnalyzeImage(base64Image, question));
    }

    /// <summary>Encodes <paramref name="texture"/> as PNG and returns the bytes as base64 text.</summary>
    private string ConvertTextureToBase64(Texture2D texture)
    {
        byte[] imageBytes = texture.EncodeToPNG();
        return System.Convert.ToBase64String(imageBytes);
    }

    /// <summary>
    /// Posts <c>{"image": ..., "question": ...}</c> as JSON to <c>serverUrl</c> and shows either the
    /// answer or the error text in <see cref="tipText"/>. The button is re-enabled on every path.
    /// </summary>
    /// <param name="base64Image">Base64-encoded image bytes.</param>
    /// <param name="question">Question to ask about the image.</param>
    public IEnumerator AnalyzeImage(string base64Image, string question)
    {
        var payload = new Dictionary<string, string>
        {
            { "image", base64Image },
            { "question", question }
        };

        byte[] postData = Encoding.UTF8.GetBytes(JsonConvert.SerializeObject(payload));

        // `using` guarantees the request is disposed on every exit path, including yield break.
        using (UnityWebRequest request = new UnityWebRequest(serverUrl, "POST"))
        {
            request.uploadHandler = new UploadHandlerRaw(postData);
            request.downloadHandler = new DownloadHandlerBuffer();
            request.SetRequestHeader("Content-Type", "application/json");

            yield return request.SendWebRequest();

            // Anything other than Success (connection, protocol or data-processing error) is a failure.
            if (request.result != UnityWebRequest.Result.Success)
            {
                Debug.LogError(request.error);
                FinishAnalysis(request.error);
                yield break;
            }

            string jsonResponse = request.downloadHandler.text;
            Debug.Log("Response JSON: " + jsonResponse); // Log the response for debugging

            if (!TryParseResponse(jsonResponse, out Response response, out string parseError))
            {
                Debug.LogError(parseError);
                FinishAnalysis(parseError);
                yield break;
            }

            Debug.Log("Answer: " + response.Answer);

            // "context" may be absent from the reply; only log it when present.
            if (response.Context != null)
            {
                foreach (var contextItem in response.Context)
                {
                    Debug.Log($"Role: {contextItem.Role}, Content: {contextItem.Content}");
                }
            }

            FinishAnalysis(response.Answer);
        }
    }

    /// <summary>
    /// Deserializes the server reply without letting a malformed body throw out of the coroutine.
    /// </summary>
    /// <returns><c>true</c> when <paramref name="response"/> is usable; otherwise <paramref name="error"/> describes why not.</returns>
    private static bool TryParseResponse(string json, out Response response, out string error)
    {
        response = null;
        error = null;

        try
        {
            response = JsonConvert.DeserializeObject<Response>(json);
        }
        catch (JsonException ex)
        {
            error = "Invalid server response: " + ex.Message;
            return false;
        }

        // DeserializeObject returns null for an empty or literal-null body.
        if (response == null)
        {
            error = "Empty server response";
            return false;
        }

        return true;
    }

    /// <summary>Stops the webcam and releases the textures this component created.</summary>
    void OnDestroy()
    {
        if (webCamTexture != null)
        {
            webCamTexture.Stop();
            // Stop() alone does not free the underlying object.
            Destroy(webCamTexture);
            webCamTexture = null;
        }

        if (snap != null)
        {
            Destroy(snap);
            snap = null;
        }
    }
}