forked from Ruturaj4/Search-Engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpre-processing.py.bak
73 lines (54 loc) · 1.74 KB
/
pre-processing.py.bak
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Name: pre-processing.py
# Author: Shaina Krumme
# Date: 18 March 2018
# Install Natural Language Toolkit (NLTK) using the following command:
# run sudo pip install -U nltk
# Download NLTK stopwords
# nltk.download('stopwords')
# Support for regular expressions.
import re
# Implementing a stop list and stemmer.
from nltk.stem import *
from nltk.stem.porter import *
from nltk.corpus import stopwords
import nltk
import os
# Pre-process the documents by removing all HTML tags and convert everything
# into lower case.
def removeTags(html):
    """Strip all HTML tags from *html* and return the plain text.

    The pattern is non-greedy so adjacent tags ("<a><b>") are removed
    as two separate matches rather than one giant span.  re.DOTALL is
    set so a tag whose attributes wrap across a newline is still
    stripped (the original pattern stopped at line breaks).
    """
    removetags = re.compile('<.*?>', re.DOTALL)
    plain = re.sub(removetags, '', html)
    return plain
def toLowercase(text):
    """Normalise *text* to lower case for case-insensitive processing."""
    lowered = text.lower()
    return lowered
# Implement a stop list and a stemmer to pre-process the documents (for the stop
# list and stemmer, you are allowed to use third-party open source code).
def filterStopWords(text):
    """Split *text* on whitespace and drop English stop words.

    Returns the surviving tokens as a list, in their original order.
    Relies on NLTK's stop-word corpus (nltk.download('stopwords')).
    """
    stop_set = set(stopwords.words('english'))
    tokens = text.split()
    return [token for token in tokens if token not in stop_set]
def stemmer(text):
    """Apply the Porter stemming algorithm to each token.

    text: an iterable of word strings (e.g. the output of
    filterStopWords).  Returns a list of stemmed words.
    """
    # The local is named `porter`, not `stemmer`: the original rebound
    # the function's own name inside its body, shadowing it.
    porter = PorterStemmer()
    return [porter.stem(word) for word in text]
# Build an inverted index (including dictionary and posting lists) for the
# documents. Please make sure to keep all the frequency information.
# def invertedIndex(documents):
#
# return index
def main():
    """Pre-process every document in ./pre-processing-files and count them.

    Each file is run through the pipeline defined above: strip HTML
    tags, lower-case, remove stop words, stem.  The stemmed result is
    currently discarded; only the file count is reported.
    """
    counter = 0
    folder = 'pre-processing-files'
    for name in os.listdir(folder):
        # os.listdir() yields bare file names; join with the folder so
        # open() works regardless of the current working directory
        # (the original opened `name` directly and would fail).
        path = os.path.join(folder, name)
        # `with` guarantees the handle is closed even on error; the
        # original never closed its files.
        with open(path, 'r') as handle:
            contents = handle.read()
        plain = removeTags(contents)
        lowered = toLowercase(plain)
        filtered = filterStopWords(lowered)
        stem = stemmer(filtered)
        counter += 1
    # print() call, not the Python 2 print statement, so the script
    # runs under Python 3 as well.
    print("Number of files", counter)

if __name__ == "__main__":
    main()