forked from a-nikolaev/study-in-scarlet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scarlet.rb
executable file
·88 lines (79 loc) · 2.34 KB
/
scarlet.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#! /usr/bin/env ruby
# Computes the similarity score between every pair of passed source code files.
# The program tokenizes each file with lexer Rouge, and then uses Sherlock
# to compute the difference between these tokenized files.
#
# Usage:
#
# ./scarlet.rb [-t threshold=0.0] [-z zerobits=0] [-n chainlength=10] -l lang file1 file2 file3 ...
#
# Options -t, -z, -n replicate Sherlock's arguments with threshold (-t) rescaled to the interval [0,1],
# and with different default values: threshold=0.0, zerobits=0, chainlength=10.
# Option -l selects the Rouge lexer applied to every file and must be one of: cpp, python, java.
#
# Examples:
#
# ./scarlet.rb -l python path/*.py
#
# ./scarlet.rb -l python -t0.7 path/*.py
#
# ./scarlet.rb -l cpp -t0.5 -z1 -n7 *.cpp
#
require 'tmpdir'
require 'rouge'
require 'optparse'
# Tokenizes one source file with Rouge and writes a compact token stream to
# output_filename: one entry per kept token, each followed by a single space.
#
# filename        - path of the source file to read.
# output_filename - path of the token file to write (created or truncated).
# lang            - "cpp", "python", or "java"; selects the Rouge lexer.
#
# Returns nil.
# Raises ArgumentError when lang is not one of the supported values (nil
# included). The check runs before any file is read or created, so a bad
# language never leaves an empty output file behind.
def lex(filename, output_filename, lang)
  # lexer = Rouge::Lexer.guess_by_filename(filename).new
  lexer =
    case lang
    when 'cpp' then Rouge::Lexers::Cpp.new
    when 'python' then Rouge::Lexers::Python.new
    when 'java' then Rouge::Lexers::Java.new
    else
      raise ArgumentError, "Invalid language: #{lang.inspect} (expected cpp, python, or java)"
    end

  source = File.read(filename)

  # Block form closes the file even if lexing raises part-way through.
  File.open(output_filename, 'w') do |out_file|
    lexer.lex(source).each do |tok, chunk|
      short = tok.shortname
      # Skip tokens with an empty shortname and those whose shortname starts
      # with 'c' (comments) or 'p' (presumably punctuation -- confirm against
      # Rouge's token table).
      next if short == '' || short[0] == 'c' || short[0] == 'p'

      # 'k...' and 'o...' tokens (presumably keywords and operators) keep their
      # literal text; every other token is reduced to its shortname only.
      if short[0] == 'k' || short[0] == 'o'
        out_file.print "#{short}_#{chunk}"
      else
        out_file.print short
      end
      out_file.print ' '
    end
  end
  nil
end
# Read CLI options (for Sherlock)
# -t, -z and -n start from the defaults below; -l has no default and is only
# present in the hash when given on the command line.
options = { t: 0.0, z: 0, n: 10 }
parser = OptionParser.new do |opts|
  # Banner and descriptions make `--help` self-explanatory.
  opts.banner = 'Usage: ./scarlet.rb [-t threshold] [-z zerobits] [-n chainlength] -l lang file1 file2 ...'
  opts.on('-t NUM', Float, 'similarity threshold in [0,1] passed to Sherlock (default 0.0)')
  opts.on('-z NUM', Integer, 'zerobits passed to Sherlock (default 0)')
  opts.on('-n NUM', Integer, 'chain length passed to Sherlock (default 10)')
  opts.on('-l STR', String, 'language of the input files: cpp, python, or java')
end
# parse! removes the recognised options from ARGV, leaving only the file
# names, and stores each value in `options` under its option letter.
parser.parse!(into: options)
# Use Rouge Lexer
# After option parsing, whatever is left in ARGV is the list of input files.
files = ARGV
if files.empty?
  # Same exit status (1) as before, but now says why.
  abort('Usage: ./scarlet.rb [-t threshold] [-z zerobits] [-n chainlength] -l lang file1 file2 ...')
end
# Without -l the lexer cannot be chosen; fail before creating any temp files.
abort('Missing required option: -l LANG (cpp, python, or java)') if options[:l].nil?

# When every input lives in the same directory the basename alone is used as
# the token file name; otherwise the whole path is flattened into the name.
dirs = files.map { |f| File.dirname(f) }
all_in_same_dir = dirs.all? { |d| d == dirs[0] }

Dir.mktmpdir do |tempdir|
  files.each do |file|
    out_basename =
      if all_in_same_dir
        File.basename(file).gsub(/\s/, '-') # replace whitespace with '-'
      else
        file.gsub(/\/|\.|\s/, '-') # replace '/', '.', and whitespace with '-'
      end
    lex(file, "#{tempdir}/#{out_basename}", options[:l])
  end

  # Run sherlock
  sherlock = './better-sherlock/sherlock'
  # Equivalent of the shell glob "tempdir/*": every non-hidden entry, sorted.
  token_files = Dir.entries(tempdir)
                   .reject { |entry| entry.start_with?('.') }
                   .sort
                   .map { |entry| "#{tempdir}/#{entry}" }
  # Argument-list form bypasses the shell, so a temp dir path containing
  # spaces or shell metacharacters cannot break the command line.
  ran = system(sherlock,
               '-t', options[:t].to_s,
               '-n', options[:n].to_s,
               '-z', options[:z].to_s,
               *token_files)
  # nil means the executable could not be started at all; with no shell in
  # between nothing else would report that.
  abort("Could not execute #{sherlock}") if ran.nil?
end