| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141 |
- """
- 文本分析处理函数
- 纯业务逻辑,不涉及文件I/O
- """
- import re
- from taskflow import get_logger
- from typing import Dict, List
# Module-level logger used by every processing step defined in this file.
logger = get_logger("examples.text_analysis.processors")
def process_read_text(text_content: str) -> Dict:
    """
    Step 1: read the text and gather its basic statistics.

    Args:
        text_content: The raw text to pre-process.

    Returns:
        A dict holding the original text, its line / word / character
        counts, and the list of lines.
    """
    logger.info("正在预处理文本...")

    # Lines are split on the newline character only, so an empty string
    # still counts as one (empty) line; words are whitespace-separated.
    lines = text_content.split('\n')
    words = text_content.split()
    characters = len(text_content)

    logger.info(f"文本统计: {len(lines)} 行, {len(words)} 词, {characters} 字符")

    return {
        "original_text": text_content,
        "line_count": len(lines),
        "word_count": len(words),
        "character_count": characters,
        "lines": lines,
    }
def process_analyze_words(text_data: Dict) -> Dict:
    """
    Step 2: analyse word frequency.

    Words are the ``\\w+`` runs of the lower-cased original text.

    Args:
        text_data: Text data dict; must contain the key "original_text".

    Returns:
        A dict with:
          - "word_frequency": the 20 most frequent words mapped to their
            counts, most frequent first (ties keep first-seen order),
          - "total_unique_words": number of distinct words,
          - "total_words": total number of words found.

    Raises:
        KeyError: if "original_text" is missing from ``text_data``.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import list.
    from collections import Counter

    logger.info("正在分析词频...")

    text = text_data["original_text"]
    words = re.findall(r'\b\w+\b', text.lower())

    # Counter replaces the hand-rolled tally. most_common(20) selects the
    # top entries without sorting every distinct word, and orders equal
    # counts by first occurrence, as a stable descending sort would.
    word_freq = Counter(words)

    result = {
        "word_frequency": dict(word_freq.most_common(20)),  # top 20 words
        "total_unique_words": len(word_freq),
        "total_words": len(words),
    }

    logger.info(f"发现 {len(word_freq)} 个不同的词")
    return result
def process_analyze_sentences(text_data: Dict) -> Dict:
    """
    Step 3: analyse sentences.

    Args:
        text_data: Text data dict; must contain the key "original_text".

    Returns:
        A dict with the sentence count and the average, maximum and
        minimum sentence length, measured in whitespace-separated words.
        All values are 0 when no sentence is found.
    """
    logger.info("正在分析句子...")

    # Naive segmentation: any run of '.', '!' or '?' ends a sentence.
    fragments = (piece.strip() for piece in re.split(r'[.!?]+', text_data["original_text"]))
    sentences = [fragment for fragment in fragments if fragment]

    # Word count of each sentence.
    word_counts = [len(sentence.split()) for sentence in sentences]

    if word_counts:
        average = sum(word_counts) / len(word_counts)
        longest = max(word_counts)
        shortest = min(word_counts)
    else:
        average = longest = shortest = 0

    summary = {
        "sentence_count": len(sentences),
        "average_sentence_length": average,
        "max_sentence_length": longest,
        "min_sentence_length": shortest,
    }

    logger.info(f"发现 {len(sentences)} 个句子")
    return summary
def process_generate_report(word_analysis: Dict, sentence_analysis: Dict, text_data: Dict) -> Dict:
    """
    Step 4: assemble the analysis report.

    Args:
        word_analysis: Result of the word-frequency analysis.
        sentence_analysis: Result of the sentence analysis.
        text_data: Pre-processed text data.

    Returns:
        A dict with three sections: "text_statistics", "word_analysis"
        and "sentence_analysis", each copying values from the inputs.
    """
    logger.info("正在生成分析报告...")

    # Each section maps a report key to the key read from its source dict.
    text_section = {
        report_key: text_data[source_key]
        for report_key, source_key in (
            ("total_lines", "line_count"),
            ("total_words", "word_count"),
            ("total_characters", "character_count"),
        )
    }
    word_section = {
        report_key: word_analysis[source_key]
        for report_key, source_key in (
            ("total_unique_words", "total_unique_words"),
            ("total_words", "total_words"),
            ("top_words", "word_frequency"),
        )
    }
    sentence_section = {
        report_key: sentence_analysis[source_key]
        for report_key, source_key in (
            ("sentence_count", "sentence_count"),
            ("average_length", "average_sentence_length"),
            ("max_length", "max_sentence_length"),
            ("min_length", "min_sentence_length"),
        )
    }

    report = {
        "text_statistics": text_section,
        "word_analysis": word_section,
        "sentence_analysis": sentence_section,
    }

    logger.info("分析报告生成完成")
    return report
|