# 读入Web原始文本
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554.txt"
# Fetch the document. The context manager closes the HTTP response as soon
# as the body has been read, instead of leaving the connection open until
# garbage collection.
with request.urlopen(url) as response:
    # read() yields bytes; decode them to obtain a str.
    raw = response.read().decode('utf8')
type(raw) #<class 'str'>
# 读取本地原始文本
# Read a local file; `with` guarantees the handle is closed after reading.
with open('document.txt') as f:
    raw = f.read()
# Look up the location of a corpus file through nltk.data.find.
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
# Plain 'r' already performs universal-newline translation in Python 3.
# The legacy 'U' flag was deprecated and raises ValueError since Python 3.11.
with open(path, 'r') as corpus_file:
    raw = corpus_file.read()
# 获取用户输入
# Prompt on stdin, tokenize the reply, and report how many tokens it holds.
s = input("Enter some text: ")
typed_tokens = word_tokenize(s)
print("You typed", len(typed_tokens), "words.")
# 原始文本本身为字符串格式，可以用字符串的函数处理
# Look up the offset of the "PART I" marker. The value is discarded here,
# so it is only visible when evaluated in an interactive session.
raw.find("PART I")
# Trim the string to the span between two fixed character offsets.
body_start, body_end = 5338, 1157743
raw = raw[body_start:body_end]
# 从原始文本中提取出词，并封装至text
# Split the raw string into tokens. word_tokenize is defined outside this
# snippet — presumably nltk.word_tokenize; confirm the import.
tokens = word_tokenize(raw)
type(tokens) #<class 'list'>
# Wrap the token list in an nltk.Text object.
text = nltk.Text(tokens)
type(text) #<class 'nltk.text.Text'>
# 用正则表达式进行文本模式匹配
import re
# Each expression below collects the entries of `wordlist` that one pattern
# matches. The results are not stored, so they only show when evaluated
# interactively.

# Entries ending in "ed".
ends_in_ed = re.compile('ed$')
[word for word in wordlist if ends_in_ed.search(word)]

# Eight-character entries with "j" in third place and "t" in sixth place.
j_and_t_slots = re.compile('^..j..t..$')
[word for word in wordlist if j_and_t_slots.search(word)]

# Four-character entries whose letters come, in order, from the sets
# g/h/i, m/n/o, j/l/k and d/e/f.
four_letter_sets = re.compile('^[ghi][mno][jlk][def]$')
[word for word in wordlist if four_letter_sets.search(word)]
# Operator Behavior
# . #Wildcard, matches any character
# ^abc #Matches some pattern abc at the start of a string
# abc$ #Matches some pattern abc at the end of a string
# [abc] #Matches one of a set of characters
# [A-Z0-9] #Matches one of a range of characters
# ed|ing|s #Matches one of the specified strings (disjunction)
# * #Zero or more of previous item, e.g. a*, [a-z]* (also known as *Kleene Closure*)
# + #One or more of previous item, e.g. a+, [a-z]+
# ? #Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]?
# {n} #Exactly n repeats where n is a non-negative integer
# {n,} #At least n repeats
# {,n} #No more than n repeats
# {m,n} #At least m and no more than n repeats
# a(b|c)+ #Parentheses that indicate the scope of the operators
# 规则化文本
# Create one stemmer of each kind and run both over the same token list.
# The stemmed lists are not stored; they only show when evaluated
# interactively.
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
list(map(porter.stem, tokens))  # the original note marks this one "better"
list(map(lancaster.stem, tokens))
# 分割句子
# Load one Gutenberg corpus file as a single string and split it into
# sentences.
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
# Pretty-print the ten sentences at indices 79 through 88.
sentence_sample = sents[79:89]
pprint.pprint(sentence_sample)