qa_content.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. import json
  2. import pandas as pd
  3. def analyze_customer_messages(excel_path):
  4. """分析不满意客户的所有对话记录"""
  5. # 读取Excel文件
  6. df = pd.read_excel(excel_path)
  7. # 1. 找出所有不满意评价的客户
  8. unsatisfied_customers = df[df['分类是否正确'] == '不满意']['客户用户名'].unique()
  9. results = []
  10. # 2. 处理每个客户的对话记录
  11. for customer in unsatisfied_customers:
  12. # 获取该客户的所有对话记录
  13. customer_df = df[df['客户用户名'] == customer].copy()
  14. # 按时间排序,当时间相同时,按对话方向排序(呼入排在呼出前面)
  15. customer_df['信息时间'] = pd.to_datetime(customer_df['信息时间'])
  16. # 创建一个排序键,使得"呼入"优先级高于"呼出"
  17. customer_df['对话方向优先级'] = customer_df['对话方向'].map(lambda x: 0 if x == "呼入" else 1)
  18. customer_df = customer_df.sort_values(['信息时间', '对话方向优先级'])
  19. # 删除临时列
  20. customer_df = customer_df.drop('对话方向优先级', axis=1)
  21. # 构建该客户的消息列表
  22. messages = []
  23. for _, row in customer_df.iterrows():
  24. message = {
  25. "自动回复意图": str(row['自动回复意图']),
  26. "自动回复类型": str(row['自动回复类型']),
  27. "客户用户名": str(row['客户用户名']),
  28. "客服用户名": str(row['客服用户名'].split(':')[-1]),
  29. "信息时间": str(row['信息时间']),
  30. "消息内容": str(row['消息内容']),
  31. "对话方向": str(row['对话方向']),
  32. "分类是否正确": str(row['分类是否正确']),
  33. "机器人自动回复状态": str(row['机器人自动回复状态'])
  34. }
  35. messages.append(message)
  36. # 添加到结果列表
  37. customer_data = {
  38. "用户": customer,
  39. "消息列表": messages
  40. }
  41. results.append(customer_data)
  42. return results
  43. def filter_context_messages(results, context_size=10):
  44. """
  45. 筛选不满意消息的上下文
  46. Args:
  47. results: 原始结果列表
  48. context_size: 前后文数量,默认10条
  49. Returns:
  50. filtered_results: 筛选后的结果列表
  51. """
  52. filtered_results = []
  53. for customer_data in results:
  54. messages = customer_data["消息列表"]
  55. filtered_messages = []
  56. # 找出所有不满意消息的索引
  57. unsatisfied_indices = [
  58. i for i, msg in enumerate(messages)
  59. if msg["分类是否正确"] == "不满意"
  60. ]
  61. # 对每个不满意消息获取上下文
  62. processed_indices = set() # 用于记录已处理的消息索引
  63. for idx in unsatisfied_indices:
  64. # 计算上下文范围
  65. start_idx = max(0, idx - context_size)
  66. end_idx = min(len(messages), idx + context_size + 1)
  67. # 添加范围内的所有消息
  68. for i in range(start_idx, end_idx):
  69. if i not in processed_indices:
  70. filtered_messages.append(messages[i])
  71. processed_indices.add(i)
  72. # 如果有筛选出的消息,则添加到结果中
  73. if filtered_messages:
  74. filtered_results.append({
  75. "用户": customer_data["用户"],
  76. "消息列表": sorted(filtered_messages,
  77. key=lambda x: messages.index(x)) # 保持原有顺序
  78. })
  79. return filtered_results
  80. def format_convert(results):
  81. converted_result = []
  82. for result in results:
  83. messages = result['消息列表']
  84. for i, message in enumerate(messages):
  85. call_intent = message['自动回复意图']
  86. call_type = message['自动回复类型']
  87. call_user = message['客户用户名']
  88. call_servicer = message['客服用户名']
  89. call_direction = message['对话方向']
  90. call_message = message['消息内容']
  91. call_result = message['分类是否正确']
  92. call_statue = message['机器人自动回复状态']
  93. # 完整的处理流程
  94. def process_excel_data(excel_path):
  95. """完整的数据处理流程"""
  96. # 1. 首先获取所有不满意客户的对话记录
  97. results = analyze_customer_messages(excel_path)
  98. # 2. 筛选不满意消息的上下文
  99. filtered_results = filter_context_messages(results)
  100. # return json.dumps(filtered_results, ensure_ascii=False, indent=4)
  101. return filtered_results
  102. # 使用示例
  103. if __name__ == "__main__":
  104. excel_path = "./data/聊天历史统计表_04_02.xlsx"
  105. final_results = process_excel_data(excel_path)
  106. print(final_results)