forked from benoitvallon/100-best-books
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path extractions.py
131 lines (106 loc) · 4.18 KB
/
extractions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from flask import Blueprint, render_template, flash, current_app
import wikipedia
import urllib2
import re
import json
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
# Blueprint grouping the two extraction routes defined below.
extractions_routes = Blueprint('extractions_routes', __name__)
@extractions_routes.route('/extract-from-csv')
def extractFromCsv():
    """Read books.csv (semicolon-separated), build a list of book dicts,
    write them as formatted JSON to books.json, and render the list view.

    Columns used (0-based): 0 title, 1 author, 3 year, 4 country,
    5 language, 6 pages, 7 image file name, 8 link (last column).
    """
    # Context manager guarantees the handle is closed even if parsing fails.
    with open("books.csv", "r") as csvFile:
        csvLines = csvFile.readlines()
    books = []
    for rawLine in csvLines:
        fields = rawLine.split(';')
        book = {
            'title': fields[0].decode('utf8'),
            'author': fields[1].decode('utf8'),
            'link': '',
            'pages': int(fields[6].decode('utf8')),
            'year': int(fields[3].decode('utf8')),
            'country': fields[4].decode('utf8'),
            'language': fields[5].decode('utf8'),
            'imageLink': 'images/' + fields[7].decode('utf8')
        }
        # NOTE(review): the guard tests column 7 (image) before reading
        # column 8 (link) -- presumably rows with an image always carry a
        # link column; confirm against the CSV layout.
        if fields[7]:
            # Bug fix: readlines() keeps the line terminator, so the last
            # field carried a trailing newline into the stored link.
            book['link'] = fields[8].decode('utf8').rstrip('\r\n')
        books.append(book)
    flash('%d book(s) have been extracted from the .csv file' % len(books), 'success')
    formattedJson = json.dumps(books, sort_keys=True, indent=2, separators=(',', ': '))
    # Previously the (None) result of write() was bound to an unused variable.
    with open("books.json", "w") as jsonFile:
        jsonFile.write(formattedJson)
    return render_template('list.html', books=books)
def requestWikipedia(page):
    """Fetch and return the wikipedia page object for the given title."""
    result = wikipedia.page(page)
    return result
def requestIsbndb(book):
pageName = book['title']
pageName = pageName.replace("_", " ")
if book['author'] != 'Unknow':
pageName = pageName + " " + book['author']
print book
print pageName
pageNameEncoded = urllib2.quote(pageName.encode('UTF-8'))
print 'Request for: ' + pageNameEncoded
isbndbXml = urllib2.urlopen("http://isbndb.com/api/v2/xml/" + current_app.config['ISBNDB_KEY'] + "/books?q=" + pageNameEncoded).read()
bestResult = getIsbndbBestResult(isbndbXml)
book['description'] = bestResult.find('physical_description_text').text
isbn10 = bestResult.find('isbn10').text
book['imageLink'] = "http://covers.openlibrary.org/b/isbn/" + isbn10 + ".jpg"
return book
def getIsbndbBestResult(isbndbXml):
    """Pick the most usable <data> entry from an ISBNDB XML response.

    Scans the results in order and returns the first one that has (a) a
    cover image reachable on openlibrary.org, (b) a non-empty physical
    description text, and (c) a page count inside that description.
    Falls back to the first result when none qualifies.

    NOTE(review): the final ``results[0]`` raises IndexError when the
    response contains no <data> elements -- confirm the API always
    returns at least one result for the queries used here.
    """
    root = ET.fromstring(isbndbXml)
    results = root.findall('data')
    for result in results:
        # Probe openlibrary for a cover matching this result's ISBN-10.
        image = urllib2.urlopen("http://covers.openlibrary.org/b/isbn/" + result.find('isbn10').text + ".jpg")
        print "http://covers.openlibrary.org/b/isbn/" + result.find('isbn10').text + ".jpg"
        responseHeaders = image.info()
        # is the result of the image load request a real image
        # NOTE(review): most HTTP responses carry a content-type header
        # regardless of payload, so this test is likely weaker than
        # intended -- consider inspecting the header's *value* instead.
        if "content-type" in responseHeaders.keys():
            print "image found"
            # is there a description in the book
            physicalDescriptionText = result.find('physical_description_text').text
            print physicalDescriptionText
            if physicalDescriptionText is not None and physicalDescriptionText != "":
                print "description found"
                # Accept only descriptions that state a page count.
                reMatch = re.search(r'([0-9]*)\s*pages', physicalDescriptionText)
                if reMatch:
                    print "pages in description: " + reMatch.group(1)
                    return result
                else:
                    print "no pages description"
                    # return result
            else:
                print "no description"
        else:
            print "no image"
    print 'All results processed'
    # Fallback: nothing fully qualified, use the first result anyway.
    return results[0]
@extractions_routes.route('/extract-from-apis')
def extractFromApis():
    """Scrape the Bokklubben World Library table from Wikipedia, enrich
    each book via the ISBNDB/openlibrary APIs, and render the list view."""
    request = requestWikipedia("Bokklubben_World_Library")
    htmlPage = BeautifulSoup(request.html(), 'html.parser')
    # First .wikitable on the page is assumed to be the book list.
    table = htmlPage.select(".wikitable")[0]
    lines = table.select('tr')
    # we remove the first line of the table as it is the legend
    # NOTE(review): [1:3] also caps processing at two rows -- presumably a
    # leftover debugging limit to avoid hammering the paid ISBNDB API;
    # [1:] would process the whole table. Confirm before "fixing".
    lines = lines[1:3]
    books = []
    for line in lines:
        cells = line.select('td')
        title = cells[0].get_text().strip()
        # Guard against cells without an anchor tag.
        link = "https://en.wikipedia.org" + cells[0].find('a').get('href').strip() if cells[0].find('a') else ''
        author = cells[1].find('a').get_text().strip() if cells[1].find('a') else ''
        book = dict(title=title, author=author, link=link)
        # Network call per book: fills in description and imageLink.
        book = requestIsbndb(book)
        books.append(book)
    flash('%d book(s) have been extracted from wikipedia and other APIs' % len(books))
    formattedJson = json.dumps(books, sort_keys=True, indent=2, separators=(',', ': '))
    # # Open/close a file
    # fileOpen = open("books.json", "w")
    # fileData = fileOpen.write(formattedJson)
    # fileOpen.close()
    return render_template('list.html', books= books)