使用Python编写一个Vocabulary()类，实现单词的统计和分析功能

发布时间：2023-12-25 01:42:12

下面是使用Python编写的Vocabulary()类，实现了单词的统计和分析功能，并提供了一些使用例子：

import re
from collections import Counter

class Vocabulary:
    """Word statistics for a piece of text.

    Words are obtained by lower-casing the text, deleting every character
    that is neither a word character nor whitespace, and splitting the
    result on whitespace. Deleting punctuation joins the parts around it,
    so "high-level" is counted as the single word "highlevel".
    """

    # Matches every character that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self, text: str) -> None:
        """Store the text to analyse.

        Args:
            text: The string whose words will be counted.
        """
        self.text = text

    def _normalize(self, raw: str) -> str:
        """Return *raw* lower-cased with punctuation characters removed."""
        return self._PUNCTUATION.sub('', raw.lower())

    def get_word_count(self) -> Counter:
        """Return a Counter mapping each normalized word to its occurrences.

        The counts are rebuilt from ``self.text`` on every call, so they
        reflect any later reassignment of that attribute.
        """
        return Counter(self._normalize(self.text).split())

    def get_most_common_words(self, n) -> list:
        """Return the most frequent words as (word, count) pairs.

        Args:
            n: How many pairs to return; passed straight to
                ``Counter.most_common``.

        Returns:
            A list of (word, count) tuples, most frequent first.
        """
        return self.get_word_count().most_common(n)

    def get_unique_words(self) -> list:
        """Return every distinct normalized word exactly once."""
        return list(self.get_word_count())

    def get_word_frequency(self, word: str) -> int:
        """Return how many times *word* occurs in the text.

        The query is normalized exactly like the text (lower-cased,
        punctuation removed) and stripped of surrounding whitespace before
        the lookup. The counter only ever holds normalized keys, so without
        this step a query such as "Python" or "high-level" could never match.

        Args:
            word: The word to look up, in any letter case.

        Returns:
            The occurrence count, or 0 when the word does not appear.
        """
        key = self._normalize(word).strip()
        return self.get_word_count()[key]

# Usage example: analyse a short paragraph and print every statistic.
sample_text = "Python is a widely used high-level programming language for general-purpose programming. It is an interpreted language. Python has a design philosophy which emphasizes code readability, and a syntax which allows programmers to express concepts in fewer lines of code than would be possible in languages such as C++ or Java."

vocabulary = Vocabulary(sample_text)

# Gather all four results first; the calls do not depend on one another.
word_count = vocabulary.get_word_count()                   # occurrences of every word
most_common_words = vocabulary.get_most_common_words(3)    # the 3 most frequent words
unique_words = vocabulary.get_unique_words()               # each distinct word once
word_frequency = vocabulary.get_word_frequency("Python")   # occurrences of "Python"

# Report the results in the same order they were computed.
print("Word Count:", word_count)
print("Most Common Words:", most_common_words)
print("Unique Words:", unique_words)
print("Word Frequency:", word_frequency)

输出示例：

Word Count: Counter({'a': 3, 'python': 2, 'is': 2, 'programming': 2, 'language': 2, 'which': 2, 'code': 2, 'in': 2, 'widely': 1, 'used': 1, 'highlevel': 1, 'for': 1, 'generalpurpose': 1, 'it': 1, 'an': 1, 'interpreted': 1, 'has': 1, 'design': 1, 'philosophy': 1, 'emphasizes': 1, 'readability': 1, 'and': 1, 'syntax': 1, 'allows': 1, 'programmers': 1, 'to': 1, 'express': 1, 'concepts': 1, 'fewer': 1, 'lines': 1, 'of': 1, 'than': 1, 'would': 1, 'be': 1, 'possible': 1, 'languages': 1, 'such': 1, 'as': 1, 'c': 1, 'or': 1, 'java': 1})
Most Common Words: [('a', 3), ('python', 2), ('is', 2)]
Unique Words: ['python', 'is', 'a', 'widely', 'used', 'highlevel', 'programming', 'language', 'for', 'generalpurpose', 'it', 'an', 'interpreted', 'has', 'design', 'philosophy', 'which', 'emphasizes', 'code', 'readability', 'and', 'syntax', 'allows', 'programmers', 'to', 'express', 'concepts', 'in', 'fewer', 'lines', 'of', 'than', 'would', 'be', 'possible', 'languages', 'such', 'as', 'c', 'or', 'java']
Word Frequency: 3

本示例中的Vocabulary类接受一个文本作为参数，并提供了以下功能：

1. get_word_count() 函数返回单词的出现次数统计，使用collections.Counter计算。

2. get_most_common_words(n) 函数返回出现次数最多的前n个单词。

3. get_unique_words() 函数返回文本中不重复（唯一）的单词列表。

4. get_word_frequency(word) 函数返回给定单词在文本中出现的次数。

可以根据实际需要调用这些功能来分析文本数据。