-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathast_token_extractor.py
133 lines (99 loc) · 4.76 KB
/
ast_token_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import json
from tqdm import tqdm
# part 1
"""
Given any AST node as a JSON string, extract the token and return a string. This token is what will get passed through Word2Vec to become a vector.
"""
from typing import List, Set, Dict, Tuple, Union, Callable
def ast2id_or_lit(node: dict) -> Union[str, None]:
    """Turn one AST node into its identifier or literal token.

    Args:
        node: one entry of the parsed JSON AST, a python dictionary that
            carries a "type" and (for token-bearing nodes) a "value".

    Returns:
        "ID:<value>" for VariableDeclarator, Identifier and Property nodes,
        "LIT:<value>" for nodes whose type begins with "Literal",
        and None for anything else, including the bare 0 entries.
    """
    # the data contains bare 0 entries in place of nodes; they carry no token
    if node == 0:
        return None

    kind = node["type"]

    # node types that name something
    if kind in ("VariableDeclarator", "Identifier", "Property"):
        return "ID:" + node["value"]

    # every javascript primitive literal type seems to begin with "Literal"
    literal_prefix = "Literal"
    if kind[:len(literal_prefix)] == literal_prefix:
        return "LIT:" + node["value"]

    # no token for any other node type
    return None
def get_all_tokens_from(ast, remove_none=True):
    """Extracts all tokens from a single AST in JSON format.

    Nodes that cannot be tokenized because they are malformed (not a
    dictionary, or missing the "type"/"value" entry) are skipped, so one bad
    node never discards the rest of the AST.

    Args:
        ast (list): The AST, as a list of dictionaries, each entry corresponding to a node.
        remove_none (bool, optional): If True, removes all `None` tokens, and leaves them in otherwise. Defaults to True.

    Returns:
        list: A list of all tokens extracted from this AST, in order of appearance.
    """
    all_tokens = []
    for node in ast:
        try:
            token = ast2id_or_lit(node)
        except (KeyError, TypeError):
            # Best effort: a malformed node yields no token at all. Only the
            # errors a bad node can cause are caught, so KeyboardInterrupt and
            # genuine programming errors are no longer silently swallowed.
            continue
        if remove_none and token is None:
            continue
        all_tokens.append(token)
    return all_tokens
def get_tokens_from_corpus(all_asts_filepath:str, remove_none=True):
    """Given a file that contains one source file AST JSON per line, extracts all tokens.

    Blank lines (for example a trailing empty line) are skipped instead of
    being handed to the JSON parser.

    Args:
        all_asts_filepath (str): Filepath to a file that contains one source file AST JSON per line.
        remove_none (bool, optional): If True, removes all `None` tokens, and leaves them in otherwise. Defaults to True.

    Returns:
        list: A list, where each entry is the list of tokens extracted from one source file AST.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    list_of_lists_of_tokens = []
    # Pin the encoding so the result does not depend on the platform's locale
    # default; undecodable bytes are still dropped, as before.
    with open(all_asts_filepath, encoding="utf-8", errors="ignore") as all_asts_fp:
        for ast_line in tqdm(all_asts_fp):  # Iterate over all lines in file
            if not ast_line.strip():
                continue  # Nothing to parse on an empty line
            ast = json.loads(ast_line)  # Each line contains a new AST
            # Collect all tokens from the AST, in order of appearance
            list_of_lists_of_tokens.append(get_all_tokens_from(ast, remove_none))
    return list_of_lists_of_tokens
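# NOTE: the function below was written under the assumption that the input was
# default acorn parser output, which turned out not to be the case.
## It is kept because it may still be useful.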
def _acorn_json (node:dict) -> Union[str,None]:
# common to all we are parsing
if node["type"] != "ExpressionStatement" :
return None
#If n is an identifier { "id":4, "type":"Identifier", "value":"console" }, return its name ID:console.
if node["expression"]["type"] == "Identifier" :
return "ID:"+node["expression"]["value"]
#If n is a literal { "id":6, "type":"Literal", "value":"hello" }, return a string representation of its value LIT:hello.
if node["expression"]["type"] == "Literal" :
return "LIT:"+node["expression"]["raw"]
#If n is a this expression, return LIT:this.
if node["expression"]["type"] == "ThisExpression" :
return "LIT:"+"this"
#If n is an update expression that increments or decrements x, return name (x).
if node["expression"]["type"] == "UpdateExpression" :
if (node["expression"]["operator"] == "++" or
node["expression"]["operator"] == "--") :
return "ID:"+node["expression"]["argument"]["name"]
# json is the same for member expressions . And []
if node["expression"]["type"] == "MemberExpression" :
#If n is a member expression base.prop that accesses a property, return name (prop).
if node["expression"]["property"]["type"] == "Identifier" :
return "ID:"+node["expression"]["property"]["name"]
#If n is a member expression base[k] that accesses an array element, return name (base).
if node["expression"]["property"]["type"] == "Literal" :
return "ID:"+node["expression"]["property"]["raw"]
#If n is a call expression base.callee (..), return name (callee).
if node["expression"]["type"] == "CallExpression" :
return "ID:"+node["expression"]["callee"]["name"]
#For any other AST node n, do not extract its name.
return None
# Public aliases: the tokenizer is sometimes referenced by these names.
# Both are bound to ast2id_or_lit, so calling either one is the same as
# calling ast2id_or_lit directly.
Token = ast2id_or_lit
token = ast2id_or_lit