parse.py
"""awesome list parser module.
This module takes repository url of "awesome list" on github and parses it to return a JSON object.
Author: Ankit Gyawali (https://github.com/ankitgyawali).
"""
import urllib.request
import urllib.error
import json
import re
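
# Illustrative shape of the object returned by parseUrl(), assembled from the
# parsing logic below; field values are hypothetical examples, not real data:
#
# {
#     "title": [
#         {
#             "name": "Section heading",
#             "links": [{"name": "Project", "url": "https://...", "detail": "Short description."}],
#             "subheaders": [
#                 {"title": "Subsection heading",
#                  "links": [{"name": "Project", "url": "https://...", "detail": "Short description."}]}
#             ]
#         }
#     ]
# }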

def parseUrl(url):
    # Rewrite the repository URL so it points at the raw README contents.
    # TODO: Some repositories name the file 'readme.md' instead of 'README.md';
    # the fallback below retries with the lowercase name.
    url = url.replace("github", "raw.githubusercontent") + "/master/README.md"
    # Object that will hold the parsed list.
    awesome = {}
    # Top-level sections found in the awesome list.
    awesome['title'] = []
    # Fetch the README; on a 404 retry with the lowercase name, otherwise give up.
    try:
        subcontent = urllib.request.urlopen(url).read().decode("utf-8")
        print("Parsed: " + url)
    except urllib.error.URLError:
        url = url.replace("README", "readme")
        try:
            subcontent = urllib.request.urlopen(url).read().decode("utf-8")
            print("Parsed: " + url)
        except urllib.error.URLError:
            print("Failed: " + url)
            return
    # Split the README on main header markdown ("## ").
    mainHeaders = subcontent.split('\n## ')[1:-1]
    for i in range(len(mainHeaders)):
        # Split each main header block on subheader markdown ("### " or "#### ").
        mainHeaders[i] = re.split(r'\n### |\n#### ', mainHeaders[i])
        # The first chunk belongs directly to the main header itself.
        mainHeaders[i][0] = mainHeaders[i][0].splitlines()
        section = {}
        section['name'] = mainHeaders[i][0][0]
        section['links'] = []
        section['subheaders'] = []
        # Parse the links that sit directly under the main header.
        for sectionLinks in mainHeaders[i][0]:
            sectionDetails = {}
            sectionDetails['url'] = sectionLinks.partition("(")[2].partition(")")[0]
            sectionDetails['name'] = sectionLinks.partition("[")[2].partition("]")[0]
            if sectionDetails['url'].strip() != '' and 'http' in sectionDetails['url']:
                # Grab the trailing description while we are at it.
                trailing = sectionLinks.rsplit(") ", 1)[-1]
                if 'http' not in trailing and '[@' not in trailing:
                    sectionDetails['detail'] = trailing
                section['links'].append(sectionDetails)
        mainHeaders[i].pop(0)
        # If the header has subheaders, parse each of them the same way.
        if len(mainHeaders[i]) != 0:
            for k, subHeaders in enumerate(mainHeaders[i]):
                mainHeaders[i][k] = subHeaders.splitlines()
                subSection = {}
                subSection['title'] = mainHeaders[i][k][0]
                subSection['links'] = []
                # Parse the links of a single subsection.
                for subHeadersSplit in mainHeaders[i][k]:
                    subSectionDetails = {}
                    subSectionDetails['url'] = subHeadersSplit.partition("(")[2].partition(")")[0]
                    subSectionDetails['name'] = subHeadersSplit.partition("[")[2].partition("]")[0]
                    if subSectionDetails['url'].strip() != '' and 'http' in subSectionDetails['url']:
                        # Grab the trailing description while we are at it.
                        trailing = subHeadersSplit.rsplit(") ", 1)[-1]
                        if 'http' not in trailing and '[@' not in trailing:
                            subSectionDetails['detail'] = trailing
                        subSection['links'].append(subSectionDetails)
                section['subheaders'].append(subSection)
        # Drop sections with no links at all, and drop empty subheader lists.
        if len(section['links']) != 0 or len(section['subheaders']) != 0:
            if len(section['subheaders']) == 0:
                del section['subheaders']
            awesome['title'].append(section)
    return awesome
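

# Minimal usage sketch: run the module directly to parse a single list and dump
# the result as JSON. The URL below is only an illustrative example of an
# "awesome list" repository, not something the module depends on.
if __name__ == "__main__":
    example_url = "https://github.com/sindresorhus/awesome"  # hypothetical example input
    result = parseUrl(example_url)
    if result is not None:
        print(json.dumps(result, indent=2))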