-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsentenceSegmentation.py
58 lines (38 loc) · 1020 Bytes
/
sentenceSegmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from util import *
# Add your import statements here
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
import re
class SentenceSegmentation():
    """Split raw text into sentences, naively or with NLTK's Punkt tokenizer."""

    # A run of whitespace that directly follows a sentence terminator
    # ('.', '?' or '!'). The look-behind is zero-width, so the terminator is
    # NOT consumed by the split and stays attached to its sentence.
    _SENTENCE_BOUNDARY = re.compile(r"(?<=[.?!])\s+")

    # Lower-case abbreviations (without the trailing period) handed to Punkt
    # as known abbreviation types.
    _ABBREVIATIONS = ('dr', 'vs', 'mr', 'mrs', 'prof', 'inc')

    def naive(self, text):
        """
        Sentence Segmentation using a Naive Approach

        Splits wherever '.', '?' or '!' is followed by whitespace. Each
        sentence keeps its terminating punctuation, surrounding whitespace
        is discarded, and empty segments are never returned.

        Parameters
        ----------
        text : str
            A string (a bunch of sentences)

        Returns
        -------
        list
            A list of strings where each string is a single sentence;
            an empty list when `text` is empty, None or only whitespace
        """
        if not text:
            return []

        # Strip first so trailing whitespace after the final terminator does
        # not produce a spurious empty last element.
        pieces = self._SENTENCE_BOUNDARY.split(text.strip())
        return [piece for piece in pieces if piece]

    def punkt(self, text):
        """
        Sentence Segmentation using the Punkt Tokenizer

        Builds a PunktSentenceTokenizer from a PunktParameters object whose
        abbreviation set is `_ABBREVIATIONS`, then tokenizes `text` with it.

        Parameters
        ----------
        text : str
            A string (a bunch of sentences)

        Returns
        -------
        list
            A list of strings where each string is a single sentence
        """
        punkt_params = PunktParameters()
        punkt_params.abbrev_types = set(self._ABBREVIATIONS)
        # The parameters object is passed in the tokenizer's first positional
        # slot, exactly as the original code did.
        return PunktSentenceTokenizer(punkt_params).tokenize(text)