使用Python编写一个Vocabulary()类,实现单词的统计和分析功能
发布时间:2023-12-25 01:42:12
下面是使用Python编写的Vocabulary()类,实现了单词的统计和分析功能,并提供了一些使用例子:
import re
from collections import Counter
class Vocabulary:
    """Word-count statistics over a piece of text.

    Words are obtained by lower-casing the text, deleting every character
    that is neither a word character nor whitespace, and splitting the
    result on whitespace.  Punctuation is deleted rather than replaced by a
    space, so "high-level" is counted as "highlevel" and "C++" as "c".
    """

    # Every character that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self, text: str) -> None:
        """Store ``text``; counts are recomputed from ``self.text`` per call."""
        self.text = text

    @classmethod
    def _normalize(cls, text: str) -> str:
        """Lower-case ``text`` and delete its punctuation characters."""
        return cls._PUNCTUATION.sub('', text.lower())

    def get_word_count(self) -> Counter:
        """Return a ``Counter`` mapping each normalized word to its count."""
        return Counter(self._normalize(self.text).split())

    def get_most_common_words(self, n=None) -> list:
        """Return the ``n`` most frequent words as ``(word, count)`` pairs.

        Pairs are ordered from most to least frequent.  When ``n`` is
        ``None`` (the default) every word is returned.
        """
        return self.get_word_count().most_common(n)

    def get_unique_words(self) -> list:
        """Return each distinct normalized word once, in first-seen order."""
        return list(self.get_word_count())

    def get_word_frequency(self, word: str) -> int:
        """Return how many times ``word`` occurs in the text.

        ``word`` is normalized exactly like the text itself (lower-cased,
        punctuation removed), so the lookup is case-insensitive and
        ``"Python"`` matches occurrences of ``"python"``.  A word that does
        not occur yields ``0``.
        """
        # The counter keys are normalized; an un-normalized query such as
        # "Python" could never match any of them.
        key = self._normalize(word).strip()
        return self.get_word_count()[key]
# Usage example
sample_text = (
    "Python is a widely used high-level programming language for "
    "general-purpose programming. It is an interpreted language. "
    "Python has a design philosophy which emphasizes code readability, "
    "and a syntax which allows programmers to express concepts in fewer "
    "lines of code than would be possible in languages such as C++ or Java."
)
vocabulary = Vocabulary(sample_text)

# Gather all four statistics first: the per-word counts, the three most
# frequent words, the list of distinct words, and the count reported for
# the word "Python".
word_count = vocabulary.get_word_count()
most_common_words = vocabulary.get_most_common_words(3)
unique_words = vocabulary.get_unique_words()
word_frequency = vocabulary.get_word_frequency("Python")

# Report the results in the same order they were computed.
print("Word Count:", word_count)
print("Most Common Words:", most_common_words)
print("Unique Words:", unique_words)
print("Word Frequency:", word_frequency)
输出示例:
Word Count: Counter({'a': 3, 'python': 2, 'is': 2, 'programming': 2, 'language': 2, 'which': 2, 'code': 2, 'in': 2, 'widely': 1, 'used': 1, 'highlevel': 1, 'for': 1, 'generalpurpose': 1, 'it': 1, 'an': 1, 'interpreted': 1, 'has': 1, 'design': 1, 'philosophy': 1, 'emphasizes': 1, 'readability': 1, 'and': 1, 'syntax': 1, 'allows': 1, 'programmers': 1, 'to': 1, 'express': 1, 'concepts': 1, 'fewer': 1, 'lines': 1, 'of': 1, 'than': 1, 'would': 1, 'be': 1, 'possible': 1, 'languages': 1, 'such': 1, 'as': 1, 'c': 1, 'or': 1, 'java': 1})
Most Common Words: [('a', 3), ('python', 2), ('is', 2)]
Unique Words: ['python', 'is', 'a', 'widely', 'used', 'highlevel', 'programming', 'language', 'for', 'generalpurpose', 'it', 'an', 'interpreted', 'has', 'design', 'philosophy', 'which', 'emphasizes', 'code', 'readability', 'and', 'syntax', 'allows', 'programmers', 'to', 'express', 'concepts', 'in', 'fewer', 'lines', 'of', 'than', 'would', 'be', 'possible', 'languages', 'such', 'as', 'c', 'or', 'java']
Word Frequency: 3
本示例中的Vocabulary类接受一个文本作为参数,并提供了以下功能:
1. get_word_count() 函数返回单词的出现次数统计,使用collections.Counter计算。
2. get_most_common_words(n) 函数返回出现次数最多的前n个单词。
3. get_unique_words() 函数返回文本中不重复的单词列表。
4. get_word_frequency(word) 函数返回给定单词在文本中的出现次数。
可以根据实际需要调用这些功能来分析文本数据。
