安装和简单使用Milvus
1 介绍
Milvus是国产的高性能分布式向量数据库。
# Milvus官网
https://milvus.io/
# 安装文档
https://milvus.io/docs/install-overview.md
# Python的对应关系和接口文档
https://milvus.io/api-reference/pymilvus/v2.4.x/About.md
2 安装Milvus
2.1 安装数据库
# 下载docker-compose配置文件(保存的文件名需与下一步 -f 参数一致)
wget https://github.com/milvus-io/milvus/releases/download/v2.4.13/milvus-standalone-docker-compose.yml -O milvus-standalone-docker-compose.yml
# 执行命令
docker-compose -f milvus-standalone-docker-compose.yml up -d
# 查看容器
docker ps | grep milvus
修改认证权限
# Milvus的配置目录,一般不用
# 下载milvus.yaml文件
# 下载地址
https://raw.githubusercontent.com/milvus-io/milvus/v2.4.13/configs/milvus.yaml
# 修改milvus.yaml文件
...
common:
...
security:
# 将milvus.yaml中的下面参数由false修改为true,以开启认证
authorizationEnabled: false
...
# 在milvus-standalone-docker-compose.yml中添加共享数据卷
...
# 安装milvus
standalone:
container_name: milvus-standalone
...
volumes:
# Milvus的配置目录,将修改后的文件放在下面即可
- /home/milvus/milvus/configs/milvus.yaml:/milvus/configs/milvus.yaml
...
milvus-standalone-docker-compose.yml
# Docker Compose file for a standalone Milvus v2.4.13 deployment.
# It starts three containers: etcd, MinIO and the Milvus standalone server.
# All persistent data is bind-mounted under /home/milvus on the host.
version: '3.5'

services:
  # etcd service (Milvus reaches it through ETCD_ENDPOINTS below)
  etcd:
    container_name: milvus-etcd
    image: quay.io/coreos/etcd:v3.5.5
    restart: always
    environment:
      - ETCD_AUTO_COMPACTION_MODE=revision
      - ETCD_AUTO_COMPACTION_RETENTION=1000
      - ETCD_QUOTA_BACKEND_BYTES=4294967296
      - ETCD_SNAPSHOT_COUNT=50000
    volumes:
      - /home/milvus/etcd:/etcd
    command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
    healthcheck:
      test: ["CMD", "etcdctl", "endpoint", "health"]
      interval: 30s
      timeout: 20s
      retries: 3

  # MinIO service (Milvus reaches it through MINIO_ADDRESS below)
  minio:
    container_name: milvus-minio
    image: minio/minio:RELEASE.2023-03-20T20-16-18Z
    restart: always
    environment:
      MINIO_ACCESS_KEY: minioadmin
      MINIO_SECRET_KEY: minioadmin
    ports:
      # 9001 is the console address set in the command below
      - "9001:9001"
      - "9000:9000"
    volumes:
      - /home/milvus/minio:/minio_data
    command: minio server /minio_data --console-address ":9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3

  # Milvus standalone server
  standalone:
    container_name: milvus-standalone
    image: milvusdb/milvus:v2.4.13
    restart: always
    command: ["milvus", "run", "standalone"]
    security_opt:
      - seccomp:unconfined
    environment:
      ETCD_ENDPOINTS: etcd:2379
      MINIO_ADDRESS: minio:9000
    volumes:
      # Optional: mount a customised milvus.yaml into the container.
      # Remove this entry if you do not need a custom configuration.
      # Download the stock file from
      #   https://raw.githubusercontent.com/milvus-io/milvus/v2.4.13/configs/milvus.yaml
      # and, to enable authentication, change this setting from false to true:
      #   common:
      #     security:
      #       authorizationEnabled: true
      - /home/milvus/milvus/configs/milvus.yaml:/milvus/configs/milvus.yaml
      # Milvus data directory
      - /home/milvus/milvus/data:/var/lib/milvus
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
      interval: 30s
      start_period: 90s
      timeout: 20s
      retries: 3
    ports:
      - "19530:19530"
      - "9091:9091"
    depends_on:
      - "etcd"
      - "minio"

networks:
  default:
    name: milvus
2.2 安装工具
⚠️ 注意:attu和Milvus有版本对应关系。
docker run -itd \
--name milvus-attu \
--restart always \
-p 3000:3000 \
zilliz/attu:v2.4.8
访问attu
# 访问地址
http://192.168.108.160:3000/#/
# 访问成功后输入Milvus的地址和端口号即可
# Milvus的默认账号/密码:root/Milvus
192.168.108.160:19530
3 使用Milvus
3.1 安装依赖
# 注意有版本对应关系
pip install pymilvus==2.4.8 -i https://pypi.tuna.tsinghua.edu.cn/simple
3.2 简单使用
import json
from pymilvus import MilvusClient, FieldSchema, DataType
from sentence_transformers import SentenceTransformer
# Name of the Milvus database that the functions below create and connect to.
db_name = "test_db"
# Name of the collection that the functions below create, fill and search.
collection_name = "test_collection"
def create_db():
    """Create the database named by the module-level ``db_name``.

    A client is opened against the server's ``default`` database using the
    default root account, and the create-database request is sent through it.
    """
    connection_args = {
        "uri": "http://192.168.108.160:19530",
        # Default account and password.
        "token": "root:Milvus",
        # Connect to the default database.
        "db_name": "default",
    }
    admin_client = MilvusClient(**connection_args)
    admin_client.create_database(db_name)
def create_collection():
    """Drop (if present) and recreate the demo collection with its indexes.

    Connects to the database named by the module-level ``db_name`` and
    builds the collection named by ``collection_name`` with these fields:

    * ``id``      - INT64 primary key, supplied by the caller on insert
    * ``score``   - FLOAT
    * ``summary`` - VARCHAR with ``max_length=300``
    * ``vector``  - FLOAT_VECTOR with ``dim=384``

    WARNING: an existing collection of the same name is dropped first, so
    any data it holds is lost.
    """
    client = MilvusClient(
        uri="http://192.168.108.160:19530",
        # Default account and password.
        token="root:Milvus",
        db_name=db_name,
    )

    # Start from a clean state: drop the collection if it already exists.
    if client.has_collection(collection_name=collection_name):
        client.drop_collection(collection_name=collection_name)

    # 1. Schema. auto_id stays False because add_data_vec() inserts rows
    #    with explicit "id" values; dynamic fields are allowed.
    schema_config = MilvusClient.create_schema(
        auto_id=False,
        enable_dynamic_field=True,
    )
    # Primary key.
    schema_config.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
    # Numeric score.
    schema_config.add_field(field_name="score", datatype=DataType.FLOAT)
    # Text summary.
    schema_config.add_field(field_name="summary", datatype=DataType.VARCHAR, max_length=300)
    # Embedding vector.
    schema_config.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=384)

    # 2. Indexes: a scalar index on the primary key and a vector index
    #    that uses cosine similarity.
    index_params_config = client.prepare_index_params()
    index_params_config.add_index(
        field_name="id",
        index_type="STL_SORT",
    )
    index_params_config.add_index(
        field_name="vector",
        index_type="AUTOINDEX",
        metric_type="COSINE",
    )

    # 3. Create the collection. The primary-key policy comes from the schema
    #    above; the former auto_id=True argument contradicted it and has been
    #    removed.
    client.create_collection(
        collection_name=collection_name,
        schema=schema_config,
        index_params=index_params_config,
        # Same value as in the schema.
        enable_dynamic_field=True,
    )
def add_data_vec():
    """Embed three sample sentences and insert them into the collection.

    Each sentence is encoded with the locally stored all-MiniLM-L6-v2
    model and inserted as one row with the fields ``id`` (0, 1, 2, ...),
    ``score`` (always 0.1), ``summary`` (the sentence text) and ``vector``
    (the embedding as a plain list). Every sentence, embedding and row is
    printed, followed by the result returned by the insert call.
    """
    client = MilvusClient(
        uri="http://192.168.108.160:19530",
        # Default account and password.
        token="root:Milvus",
        db_name=db_name,
    )

    # Embedding model loaded from a local directory.
    model = SentenceTransformer(
        model_name_or_path="E:/model/sentencetransformers/all-MiniLM-L6-v2"
    )

    sentences = [
        "This framework generates embeddings for each input sentence",
        "Sentences are passed as a list of string.",
        "The quick brown fox jumps over the lazy dog.",
    ]
    # Note: all-MiniLM-L6-v2 outputs 384-dimensional vectors, which must
    # match the dim of the collection's "vector" field.
    sentence_embeddings = model.encode(sentences)

    # Build one row per sentence; the position in the list is used as the id.
    data_list = []
    for i, (sentence, embedding) in enumerate(zip(sentences, sentence_embeddings)):
        print("Sentence:", sentence)
        print("Embedding:", embedding)
        data_item = {
            "id": i,
            "score": 0.1,
            "summary": sentence,
            "vector": embedding.tolist(),
        }
        print(data_item)
        data_list.append(data_item)

    # Insert all rows in a single request and show the server's response.
    res = client.insert(collection_name=collection_name, data=data_list)
    print(res)
def query_data():
    """Run a vector similarity search for the text "my dog" and print it.

    The query text is embedded with the locally stored all-MiniLM-L6-v2
    model, then searched against the collection named by the module-level
    ``collection_name``. The raw result is printed first, followed by the
    same result serialised as JSON with a 4-space indent.
    """
    milvus = MilvusClient(
        uri="http://192.168.108.160:19530",
        # Default account and password.
        token="root:Milvus",
        db_name=db_name,
    )

    encoder = SentenceTransformer(
        model_name_or_path="E:/model/sentencetransformers/all-MiniLM-L6-v2"
    )
    query_vector = encoder.encode("my dog").tolist()

    hits = milvus.search(
        collection_name=collection_name,
        # A list of query vectors, e.g. [[0.358, -0.602, ...]].
        data=[query_vector],
        # Maximum number of results to return.
        limit=5,
        search_params={"metric_type": "COSINE", "params": {}},
        # Entity fields to include in each result.
        output_fields=["score", "summary"],
    )
    print(hits)

    pretty = json.dumps(hits, indent=4)
    print(pretty)
if __name__ == "__main__":
    # The tutorial steps, in order. Uncomment the earlier ones on first run:
    #   1. create_db()          - create the database
    #   2. create_collection()  - create the collection
    #   3. add_data_vec()       - insert the sample vectors
    # Only step 4, querying the data, is enabled here.
    query_data()
截图