""" 文本分析处理函数 纯业务逻辑,不涉及文件I/O """ import re from taskflow import get_logger from typing import Dict, List logger = get_logger("examples.text_analysis.processors") def process_read_text(text_content: str) -> Dict: """ 步骤1: 读取并预处理文本 Args: text_content: 文本内容 Returns: 处理后的文本数据 """ logger.info("正在预处理文本...") # 统计基本信息 lines = text_content.split('\n') words = text_content.split() characters = len(text_content) result = { "original_text": text_content, "line_count": len(lines), "word_count": len(words), "character_count": characters, "lines": lines } logger.info(f"文本统计: {len(lines)} 行, {len(words)} 词, {characters} 字符") return result def process_analyze_words(text_data: Dict) -> Dict: """ 步骤2: 分析词频 Args: text_data: 文本数据 Returns: 词频分析结果 """ logger.info("正在分析词频...") text = text_data["original_text"] words = re.findall(r'\b\w+\b', text.lower()) # 统计词频 word_freq = {} for word in words: word_freq[word] = word_freq.get(word, 0) + 1 # 排序 sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True) result = { "word_frequency": dict(sorted_words[:20]), # 前20个最常见的词 "total_unique_words": len(word_freq), "total_words": len(words) } logger.info(f"发现 {len(word_freq)} 个不同的词") return result def process_analyze_sentences(text_data: Dict) -> Dict: """ 步骤3: 分析句子 Args: text_data: 文本数据 Returns: 句子分析结果 """ logger.info("正在分析句子...") text = text_data["original_text"] # 简单的句子分割(基于句号、问号、感叹号) sentences = re.split(r'[.!?]+', text) sentences = [s.strip() for s in sentences if s.strip()] # 统计句子长度 sentence_lengths = [len(s.split()) for s in sentences] result = { "sentence_count": len(sentences), "average_sentence_length": sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0, "max_sentence_length": max(sentence_lengths) if sentence_lengths else 0, "min_sentence_length": min(sentence_lengths) if sentence_lengths else 0 } logger.info(f"发现 {len(sentences)} 个句子") return result def process_generate_report(word_analysis: Dict, sentence_analysis: Dict, text_data: Dict) -> Dict: """ 步骤4: 生成分析报告 Args: word_analysis: 词频分析结果 sentence_analysis: 句子分析结果 text_data: 原始文本数据 Returns: 分析报告 """ logger.info("正在生成分析报告...") report = { "text_statistics": { "total_lines": text_data["line_count"], "total_words": text_data["word_count"], "total_characters": text_data["character_count"] }, "word_analysis": { "total_unique_words": word_analysis["total_unique_words"], "total_words": word_analysis["total_words"], "top_words": word_analysis["word_frequency"] }, "sentence_analysis": { "sentence_count": sentence_analysis["sentence_count"], "average_length": sentence_analysis["average_sentence_length"], "max_length": sentence_analysis["max_sentence_length"], "min_length": sentence_analysis["min_sentence_length"] } } logger.info("分析报告生成完成") return report