🚀 作者 :“码上有前”
🚀 文章简介 :Python
🚀 欢迎小伙伴们 点赞👍、收藏⭐、留言💬
python练习题
- 抽取关键词
抽取关键词
import os
import json
import pandas as pd
# Folder paths to scan and the name of the keyword column.
# folder_path is the folder intended for extract_keywords (CSV files);
# categories_path is the folder scanned for JSON files.
folder_path = './Cosmetic_data/Brand_Classification/brand&details_analysis'
categories_path = './Cosmetic_data/Makeup_Classification/pcommit&details_analysis'
keyword_column = '关键词' # replace with the actual keyword column name
def extract_keywords(folder_path, keyword_column):
    """Collect the values of one column from every CSV file in a folder.

    Each ``*.csv`` file directly inside ``folder_path`` is read with pandas.
    Files that do not contain ``keyword_column`` are skipped. Newlines and
    tabs are removed from every value, surrounding whitespace is stripped,
    and missing or empty cells are dropped.

    Args:
        folder_path: Directory containing the CSV files.
        keyword_column: Name of the column holding the keywords.

    Returns:
        A list of cleaned keyword strings, in file-name order and then row
        order. Duplicates are kept.
    """
    keyword_list = []
    # os.listdir returns names in arbitrary order; sort for a stable result.
    csv_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.csv')
    )
    for csv_file in csv_files:
        file_path = os.path.join(folder_path, csv_file)
        df = pd.read_csv(file_path)
        if keyword_column not in df.columns:
            continue
        # Drop missing cells BEFORE converting to str; otherwise NaN becomes
        # the literal text 'nan' and survives the empty-string filter below.
        column = df[keyword_column].dropna().astype(str)
        cleaned = column.str.replace(r'\n|\t', '', regex=True).str.strip()
        # Discard values that are empty after cleaning.
        keyword_list.extend(keyword for keyword in cleaned.tolist() if keyword)
    return keyword_list
# 提取关键词
# result_keywords = extract_keywords(folder_path, keyword_column)
# 打印结果
# print("提取的关键词列表:")
# print(result_keywords)
def extract_keywords_from_json(categories_path, keyword_key):
    """Collect keyword strings stored under one key in a folder of JSON files.

    Each ``*.json`` file directly inside ``categories_path`` is parsed as
    UTF-8 JSON. The file may contain a list of records or a single record
    (a top-level object). For every record that is a dict containing
    ``keyword_key``, the value is taken either as one keyword (a string) or
    as several (a list of strings). Newlines and tabs are removed, the text
    is stripped, and empty results are dropped.

    Records that are not dicts, values that are neither a string nor a
    list, and non-string list entries are ignored rather than raising.

    Args:
        categories_path: Directory containing the JSON files.
        keyword_key: Key under which each record stores its keyword(s).

    Returns:
        A list of cleaned keyword strings, in file-name order and then
        record order. Duplicates are kept.
    """
    keyword_list = []
    # os.listdir returns names in arbitrary order; sort for a stable result.
    json_files = sorted(
        name for name in os.listdir(categories_path) if name.endswith('.json')
    )
    for json_file in json_files:
        file_path = os.path.join(categories_path, json_file)
        with open(file_path, 'r', encoding="utf-8") as f:
            data = json.load(f)
        # A top-level object is treated as a single record. Iterating it
        # directly would yield its keys (strings) instead of records.
        records = [data] if isinstance(data, dict) else data
        if not isinstance(records, list):
            continue
        for item in records:
            # Only dicts can hold the key; `in` on a string would be a
            # substring test and the subsequent lookup would raise TypeError.
            if not isinstance(item, dict) or keyword_key not in item:
                continue
            keywords = item[keyword_key]
            if isinstance(keywords, str):
                keywords = [keywords]
            elif not isinstance(keywords, list):
                continue
            for keyword in keywords:
                if not isinstance(keyword, str):
                    continue
                cleaned = keyword.replace('\n', '').replace('\t', '').strip()
                if cleaned:
                    keyword_list.append(cleaned)
    return keyword_list
# Runs at module level: scan the JSON files under categories_path and collect
# the values stored under the keyword_column key.
categories_keywords = extract_keywords_from_json(categories_path, keyword_column)