import os
import fiftyone as fo
import fiftyone.zoo as foz
import yaml
classes = [
    'Person',           # 0
    'Car',              # 1
    'Taxi',             # 2
    'Ambulance',        # 3
    'Bus',              # 4
    'Bicycle',          # 5
    'Motorcycle',       # 6
    'Dog',              # 7
    'Cat',              # 8
    'Mouse',            # 9
    'Backpack',         # 10
    'Glasses',          # 11
    'Hat',              # 12
    'Helmet',           # 13
    'Traffic light',    # 14
    'Knife',            # 15
    'Mobile phone',     # 16
    'Umbrella'          # 17
    # Handheld object   - 18
    # Smoke             - 19
    # Flame             - 20
]
# Build a mapping from class name to global class index
class_to_index = {cls: idx for idx, cls in enumerate(classes)}
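# For example, class_to_index["Person"] == 0 and class_to_index["Umbrella"] == 17,
# matching the index comments in the class list above.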
def filter_samples_by_label(dataset, target_label):
    # Create a new dataset to hold the filtered samples
    filtered_dataset = fo.Dataset()
    # Iterate over the original dataset
    for sample in dataset:
        # Keep only the detections whose label matches target_label
        filtered_detections = [d for d in sample.ground_truth.detections if d.label == target_label]
        # If the sample still has matching detections, copy it into the new dataset
        if filtered_detections:
            new_sample = sample.copy()
            new_sample.ground_truth.detections = filtered_detections
            filtered_dataset.add_sample(new_sample)
    return filtered_dataset
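# Note: the same filtering can also be done with FiftyOne's built-in view stages instead of
# a manual loop. The function below is an optional alternative sketch (not part of the
# original script): filter_labels() with only_matches=True keeps only matching boxes and
# drops samples with no remaining boxes, and clone() materializes the view as a new dataset.
from fiftyone import ViewField as F

def filter_samples_by_label_view(dataset, target_label):
    # Keep only "ground_truth" detections whose label matches target_label
    view = dataset.filter_labels("ground_truth", F("label") == target_label, only_matches=True)
    # Materialize the filtered view as a standalone dataset
    return view.clone()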
def update_txt_file_class_indices(class_name):
    labels_dir = os.path.join("/yolov5/open-images-v7", class_name, 'labels/val')
    dataset_yaml = os.path.join("/yolov5/open-images-v7", class_name, 'dataset.yaml')
    # Remove the per-class dataset.yaml written by the export; a single data.yaml is written later
    if os.path.exists(dataset_yaml):
        os.remove(dataset_yaml)
    if os.path.exists(labels_dir):
        for filename in os.listdir(labels_dir):
            if filename.endswith('.txt'):
                filepath = os.path.join(labels_dir, filename)
                # Read the .txt label file
                with open(filepath, 'r') as file:
                    lines = file.readlines()
                # Rewrite the class index on each line
                updated_lines = []
                for line in lines:
                    parts = line.strip().split()
                    if len(parts) >= 5:  # each YOLO line has at least 5 fields; the class index is the first
                        class_idx_str = parts[0]  # class index written by the per-class export
                        try:
                            updated_class_idx = str(class_to_index[class_name])
                        except KeyError:
                            # If the class is not in the global list, keep the original index
                            updated_class_idx = class_idx_str
                        parts[0] = updated_class_idx
                        updated_lines.append(' '.join(parts) + '\n')
                # Write the updated lines back
                with open(filepath, 'w') as file:
                    file.writelines(updated_lines)
    print(f"Class indices in all .txt files for class '{class_name}' have been updated.")
for class_name in classes:
    dataset = foz.load_zoo_dataset(
        "open-images-v7",
        split="train",                  # "train", "validation", or "test"
        label_types=["detections"],     # "detections", "segmentations", "points"
        classes=[class_name],
        max_samples=100000,             # download at most 100,000 samples
        shuffle=True,
        only_matching=True,             # only load labels that match the requested classes
        num_workers=1,                  # single download worker
        dataset_name=f"open-images-{class_name}",  # per-class name so each iteration gets its own dataset
        dataset_dir="/open-images-v7"
    )
    filtered_dataset = filter_samples_by_label(dataset, class_name)
    if filtered_dataset.count() == 0:
        print(f"No samples found for class: {class_name}")
        continue
    print(f"Number of samples for class {class_name}: {filtered_dataset.count()}")
    # Export the filtered dataset in YOLOv5 format
    filtered_dataset.export(
        export_dir=f"/yolov5/open-images-v7/{class_name}/",
        dataset_type=fo.types.YOLOv5Dataset,
        label_field="ground_truth",
    )
    update_txt_file_class_indices(class_name)
data_yaml_content = {
    'train': '/yolov5/open-images-v7',
    'val': '/yolov5/open-images-v7',
    'test': '',                 # fill in a test set path if needed
    'nc': len(classes),         # number of classes
    'names': classes            # list of class names
}
with open('/yolov5/open-images-v7/data.yaml', 'w') as f:
    yaml.dump(data_yaml_content, f, default_flow_style=False)
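# For reference, with the class list above the generated data.yaml should look roughly like
# this (yaml.dump sorts keys alphabetically by default):
#
#   names:
#   - Person
#   - Car
#   ...
#   nc: 18
#   test: ''
#   train: /yolov5/open-images-v7
#   val: /yolov5/open-images-v7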
The script downloads up to 100,000 samples per class (for a quick test, lower the max_samples value).
FiftyOne downloads the raw data under /open-images-v7; after conversion, the data is split into one directory per class under /yolov5/open-images-v7, roughly as sketched below:
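A rough sketch of the expected output layout (paths taken from the script; FiftyOne's YOLOv5 exporter writes images/val and labels/val folders for the default "val" split, and the per-class dataset.yaml files are removed by update_txt_file_class_indices; file names are placeholders):

    /yolov5/open-images-v7/
        data.yaml
        Person/
            images/val/<image-id>.jpg ...
            labels/val/<image-id>.txt ...
        Car/
            images/val/...
            labels/val/...
        ...
        Umbrella/
            images/val/...
            labels/val/...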