Skip to content

jainpranj/github-neo4j

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

21 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

#CYPHER QUERIES

##GITHUB PYTHON PACKAGE ANALYSIS -RELATIONSHIP BETWEEN LANGUAGES ##LOAD LANGUAGES

LOAD CSV WITH HEADERS FROM "file:///github-clustering/packages_in_file_nodes.csv" AS csvLine CREATE (p:Packages { packagename: csvLine.package, count: toInt(csvLine.count) })

##LOAD RELATIONSHIPS

USING PERIODIC COMMIT 500 LOAD CSV WITH HEADERS FROM "file:///github-clustering/packages_in_file_edges_py.csv" AS csvLine MATCH (package:Packages { packagename: csvLine.package1}),(package2:Packages { packagename: csvLine.package2}) MERGE (package)-[:DEPENDS { count: csvLine.count }]->(package2)

##QUERIES

Match(p:Packages)-[d:DEPENDS]->(n) Where p.packagename = 'django' Return n order by d.count desc limit 10

Match(p:Packages)-->(n) Where p.packagename = 'django' Return n

Match(p:Packages)-->(n) Where p.packagename = 'os' Return n

MATCH (n:Packages) RETURN n LIMIT 25

CREATE INDEX ON :Packages(packagename)

##BIG QUERY

SELECT type, repo.name, created_at,actor.login, JSON_EXTRACT(payload, '$.action') as event, FROM (TABLE_DATE_RANGE([githubarchive:day.], TIMESTAMP('2017-04-22'), TIMESTAMP('2017-04-22') )) WHERE type = 'PushEvent' LIMIT 100

SELECT type, repo.name, created_at,actor.login, JSON_EXTRACT(payload, '$.action') as event, FROM (TABLE_DATE_RANGE([githubarchive:day.], TIMESTAMP('2017-04-22'), TIMESTAMP('2017-04-22') )) WHERE type = 'IssueCommentEvent' LIMIT 1000

##LOAD GITHUB EVENTS DATA-4 EVENTS FORK PULL PUSH ISSUECOMMENT

LOAD CSV WITH HEADERS FROM "file:///github-events/pushevent.csv" AS line MERGE (u:User {name:line.actor_login}) CREATE (ev:PushEvent {time:toInt(line.created_at) }) MERGE (u)-[:DO]->(ev) MERGE (repo:Repository {id:toInt(line.created_at)}) SET repo.name=line.repo_name MERGE (owner:User {name:line.actor_login}) MERGE (repo)-[:OWNED_BY]->(owner)

LOAD CSV WITH HEADERS FROM "file:///github-events/pullrequestevent.csv" AS line MERGE (u:User {name:line.actor_login}) CREATE (ev:PullRequestEvent {time:toInt(line.created_at), event:line.event }) MERGE (u)-[:DO]->(ev) MERGE (repo:Repository {id:toInt(line.created_at)}) SET repo.name=line.repo_name MERGE (owner:User {name:line.actor_login}) MERGE (repo)-[:OWNED_BY]->(owner)

LOAD CSV WITH HEADERS FROM "file:///github-events/forkevent.csv" AS line MERGE (u:User {name:line.actor_login}) CREATE (ev:ForkEvent {time:toInt(line.created_at) }) MERGE (u)-[:DO]->(ev) MERGE (repo:Repository {id:toInt(line.created_at)}) SET repo.name=line.repo_name MERGE (owner:User {name:line.actor_login}) MERGE (repo)-[:OWNED_BY]->(owner)

LOAD CSV WITH HEADERS FROM "file:///github-events/watchevent.csv" AS line MERGE (u:User {name:line.actor_login}) CREATE (ev:WatchEvent {time:toInt(line.created_at), event:line.event }) MERGE (u)-[:DO]->(ev) MERGE (repo:Repository {id:toInt(line.created_at)}) SET repo.name=line.repo_name MERGE (owner:User {name:line.actor_login}) MERGE (repo)-[:OWNED_BY]->(owner)

LOAD CSV WITH HEADERS FROM "file:///github-events/issuecomment.csv" AS line MERGE (u:User {name:line.actor_login}) CREATE (ev:IssueCommentEvent {time:toInt(line.created_at), event:line.event }) MERGE (u)-[:DO]->(ev) MERGE (repo:Repository {id:toInt(line.created_at)}) SET repo.name=line.repo_name MERGE (owner:User {name:line.actor_login}) MERGE (repo)-[:OWNED_BY]->(owner)

#With most events

MATCH (u:User)-[r:DO]->() RETURN u.name, count(r) as events ORDER BY events DESC LIMIT 1

#fork Events MATCH (u:User)-[r:DO]->(f:ForkEvent) RETURN u.name, count(r) as forkevents ORDER BY forkevents DESC LIMIT 1

#maximum touches on repo

MATCH (repo:Repository)-[r]->() RETURN repo.name, count(r) as touchs ORDER BY touchs DESC LIMIT 1

#HBASE COMMANDS

hbase org.apache.hadoop.hbase.mapreduce.ImportTsv -Dimporttsv.separator=',' -Dimporttsv.columns="HBASE_ROW_KEY,project:project_id,project:language,size:bytes,project:url,project:repo,time:measured_at,size:percent,size:total_bytes" 'githublanguages' hdfs://localhost:9000/hbase-data-languages.csv

#COLUMNS

project_id,language,bytes,url,repo,measured_at,percent,total_bytes

#create the table in hbase column family

create 'githublanguages','project','size','time'

#create this table in phoenix

CREATE TABLE "githublanguages" ( ROWKEY VARCHAR PRIMARY KEY, "project"."project_id" VARCHAR,"project"."language" VARCHAR,"size"."bytes" VARCHAR,"project"."url" VARCHAR,"project"."repo" VARCHAR,"time"."measured_at" VARCHAR,"size"."percent" VARCHAR,"size"."total_bytes" VARCHAR) ;

create index lang_index on table githublanguages(id,language) as 'lang'

#go to phoenix

./sqlline.py to start ./squirrel-start.sh

#PIG

queryData = LOAD '/home/pranjal/Downloads/github-neo4j-master/hbase-data-languages_new.csv' USING PigStorage(',') AS (ID:int,project_id:chararray,language:chararray,bytes:int,url:chararray,repo:chararray,measured_at:chararray,percent:chararray,total_bytes:chararray); groupNewData = GROUP queryData BY language; countData = FOREACH groupNewData GENERATE group, COUNT(queryData.language) AS topLanguages; orderedData = ORDER countData by topLanguages DESC; finalData = LIMIT orderedData 10; STORE finalData INTO 'outFile';

Releases

No releases published

Packages

No packages published