Skip to content

Commit

Permalink
#23 Preliminary visualisation is working
Browse files Browse the repository at this point in the history
  • Loading branch information
ohtohalla committed Feb 23, 2022
1 parent 974df14 commit f40497d
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 0 deletions.
56 changes: 56 additions & 0 deletions flask_searchengine_vis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python3

from flask import Flask, render_template, request
from bs4 import BeautifulSoup as bs
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


# Corpus file; it is parsed below for <article name="..."> elements.
ARTICLES_FILE = "data/enwiki-20181001-corpus.1000-articles.txt"


# One {'name', 'content'} record per <article> element found in the corpus.
document_dicts = []
# Decode explicitly instead of relying on the platform's locale encoding.
# NOTE(review): the corpus is assumed to be UTF-8 -- confirm against the data.
with open(ARTICLES_FILE, encoding="utf-8") as file:
    soup = bs(file, 'lxml')
    for article in soup.find_all('article'):
        document_dicts.append({
            'name': article['name'],
            'content': article.get_text(strip=True),
        })

# Every record above is created with a 'content' key, so no filtering is
# needed; list positions here line up with positions in document_dicts.
documents = [d['content'] for d in document_dicts]

tv = TfidfVectorizer(lowercase=True, sublinear_tf=True, use_idf=True, norm="l2")
# fit_transform yields one row per document; transpose so that a
# (1 x terms) query vector can be multiplied against a (terms x documents)
# matrix, and keep the result in CSR form.
t_matrix = tv.fit_transform(documents).T.tocsr()


def search_dict():
    """Prompt for a query and return the matching documents, best score first.

    The query is read from standard input, vectorised with the module-level
    ``tv`` vectoriser and multiplied against ``t_matrix``; every document with
    a non-zero score is reported.

    Returns:
        A list of dicts, one per matching document, whose values are all
        strings: ``'hit'`` (1-based rank), ``'score'`` (four decimals),
        ``'name'`` (document name) and ``'content'`` (first 100 characters
        followed by an ellipsis). The list is empty when the query is empty
        or nothing matches. If the search fails part-way, the failure is
        logged and the matches collected so far are returned.
    """
    import logging

    query = input("Search for: ")
    if not query:
        return []

    matches = []
    try:
        query_vector = tv.transform([query]).tocsc()
        # Use the sparse matrix's own product: np.dot is not sparse-aware and
        # only works on these operands by accident.
        hits = query_vector.dot(t_matrix)
        # Compute the non-zero positions once and reuse them for both the
        # scores and the document ids.
        rows, doc_ids = hits.nonzero()
        scores = np.asarray(hits[rows, doc_ids]).ravel()
        ranked = sorted(zip(scores, doc_ids), reverse=True)
        for rank, (score, doc_id) in enumerate(ranked, start=1):
            document = document_dicts[doc_id]
            matches.append({
                'hit': f"{rank:d}",
                'score': f"{score:.4f}",
                'name': f"{document['name']:s}",
                'content': f"{document['content']:.100s}…",
            })
    except Exception:
        # Stay best-effort (never crash the prompt), but leave a trace instead
        # of silently swallowing the error; KeyboardInterrupt/SystemExit are
        # no longer caught.
        logging.getLogger(__name__).exception("Search for %r failed", query)
    return matches

def visualise_search(input_dict):
    """Scatter-plot the score of each search hit against its rank.

    Args:
        input_dict: Match records carrying at least the keys ``'hit'`` and
            ``'score'`` (for example the list returned by ``search_dict``,
            where both values are formatted strings).

    Returns:
        None. When there are matches, the plot window is shown via
        ``plt.show()`` first; when ``input_dict`` is empty nothing is drawn.
    """
    # Nothing to plot: relplot would fail on a frame without the
    # 'hit'/'score' columns, so bail out before loading the plotting stack.
    if not input_dict:
        return None

    import seaborn as sns
    import pandas as pd
    import matplotlib.pyplot as plt

    matches_df = pd.DataFrame(input_dict)
    # The values arrive as formatted strings; convert them so the axes are
    # numeric instead of one categorical tick per distinct label.
    for column in ("hit", "score"):
        matches_df[column] = pd.to_numeric(matches_df[column])

    sns.set_theme()
    sns.relplot(data=matches_df, x="hit", y="score")
    return plt.show()



if __name__ == "__main__":
    # Script entry point: prompt for one query and plot its ranked scores.
    visualise_search(search_dict())

15 changes: 15 additions & 0 deletions init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import os
from flask import Flask
import flask_searchengine


# Create the Flask application object. With instance_relative_config=True,
# Flask resolves relative config file names against the instance folder
# rather than the application root.
app = Flask(__name__, instance_relative_config=True)

@app.route('/hello')
def main():
    """Handle ``/hello``: call the search step, then return the results view.

    Returns whatever ``flask_searchengine.results`` produces for an empty
    match list.

    NOTE(review): the return value of ``flask_searchengine.search()`` is
    discarded, so ``results`` always receives an empty list -- confirm whether
    the search output was meant to be passed on.
    """
    flask_searchengine.search()
    no_matches = []
    return flask_searchengine.results(no_matches)


0 comments on commit f40497d

Please sign in to comment.