-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathqueries-Webcrawl.json
1 lines (1 loc) · 17.8 KB
/
queries-Webcrawl.json
1
[{"value":"for url in urls\n remove url in urls","parameter":{},"name":"Delete all urls"},{"value":"for link in links\n filter link.symmetric != \"\"\n return distinct { \"srcurl\" : link.from, \"srcattr\" : link.attr, \"refattr\" : document(link.symmetric).attr, \"refurl\" : link.to }","parameter":{},"name":"List symmetric links"},{"value":"/* List Associations by searching for all urls with class \"Association\" */\nfor url in urls\n filter url.class == \"Association\"\n sort url.url\n return url.url","parameter":{},"name":"List Associations"},{"value":"let urlids = (\n for url in urls\n filter url.ctrlcnt > 1\n return url._id\n)\nfor urlid in urlids\n for link in links\n filter (link.attr == \"control\" && link._to == urlid) || (link.attr == \"controlledby\" && link._from == urlid)\n return { \"url\" : document(urlid).url, \"attr\" : link.attr, \"from\" : link.from, \"to\" : link.to }","parameter":{},"name":"List duplicate control"},{"value":"for url in urls\n filter url.controlled != \"\"\n return url","parameter":{},"name":"List controlled urls"},{"value":"/* List Vendors by searching for all urls with class \"Vendor\" */\nfor url in urls\n filter url.class == \"Vendor\"\n sort url.url\n return url.url","parameter":{},"name":"List Vendors"},{"value":"for url in urls\n filter url.class == \"\"\n return url","parameter":{},"name":"List untagged urls"},{"value":"/* List Publishers by searching for all urls with class \"Publisher\" */\nfor url in urls\n filter url.class == \"Publisher\"\n return url.url","parameter":{},"name":"List Publishers"},{"value":"for link in links\n remove link in links","parameter":{},"name":"Delete all links"},{"value":"/* List the url rank for Associations, Publishers, and Vendors sorted by number of links in descending order.\n */\n for url in urls\n filter (url.class == \"Association\") || (url.class == \"Publisher\") || (url.class == \"Vendor\")\n sort url.nlinks desc\n return { \"url\" : url.url, \"class\" : url.class, \"nlinks\" : 
url.nlinks, \"total\" : url.total, \"average\" : url.average }","parameter":{},"name":"List url ranking by nlinks"},{"value":"/* List the url rank for Associations, Publishers, and Vendors sorted by average link weight in descending order.\n */\n for url in urls\n filter (url.class == \"Association\") || (url.class == \"Publisher\") || (url.class == \"Vendor\")\n sort url.average desc\n return { \"url\" : url.url, \"class\" : url.class, \"nlinks\" : url.nlinks, \"total\" : url.total, \"average\" : url.average }","parameter":{},"name":"List url rank by average"},{"value":"let vertices = [\n{\n \"name\" : \"A\",\n \"weight\" : 1\n},\n{\n \"name\" : \"B\",\n \"weight\" : 1\n},\n{\n \"name\" : \"B\",\n \"weight\" : 10\n},\n{\n \"name\" : \"C\",\n \"weight\" : 1\n},\n{\n \"name\" : \"C\",\n \"weight\" : 1\n},\n{\n \"name\" : \"C\",\n \"weight\" : 10\n}\n]\nfor v in vertices\n collect name = v.name into g\n return { \"name\" : name, \"nlinks\" : count(g[*].v.weight), \"total\" : sum(g[*].v.weight), \"average\" : average(g[*].v.weight) }","parameter":{},"name":"x - Example"},{"value":"/* List shortest paths from JournalList to each Association, Publisher, and Vendor\n */\nlet jlurl = (\nfor url in urls\n filter url.url == \"https://www.journallist.net/\"\n return url\n)\nfor url in urls\n filter ((url.class == \"Association\") || (url.class == \"Publisher\") || (url.class == \"Vendor\")) && (url._id != jlurl[0]._id)\n for path in outbound shortest_path jlurl[0]._id to url._id graph urlrank\n collect start = jlurl[0].url, target = url.url into groups \n return { \"start\" : start, \"target\" : target, \"nhops\" : count(groups[*].path), \"groups\" : groups[*].path.url }\n","parameter":{},"name":"List Shortest Paths to JournalList"},{"value":"/* Set link weight:\n social, contact, & disclosure = 0 as they are asymmetric links and are self referential\n belongto, control, & customer = 1 as potential false claims of membership, control, and customer are more likely \n 
member, controlledby, & vendor = 2 as false claims of membership, controlledby, and vendor are less likely\n symmetric links = 10 as they have the highest value as the claims are reciprocal\n */\nlet list1 = (\nfor link in links\n filter (link.attr == \"social\") || (link.attr == \"contact\") || (link.attr == \"disclosure\")\n return { \"linkid\" : link._id, \"symmetric\" : \"\", \"weight\" : 0 }\n)\nlet list2 = (\nfor link in links\n filter (link.attr == \"belongto\") || (link.attr == \"control\") || (link.attr == \"customer\")\n return { \"linkid\" : link._id, \"symmetric\" : \"\", \"weight\" : 1 }\n)\nlet list3 = (\nfor link in links\n filter (link.attr == \"member\") || (link.attr == \"controlledby\") || (link.attr == \"vendor\")\n return { \"linkid\" : link._id, \"symmetric\" : \"\", \"weight\" : 2 }\n)\nlet list4 = (\nfor from in links\n for to in links\n filter to._from == from._to && from._from == to._to && from._id != to._id && from.attr != to.attr\n return { \"linkid\" : from._id, \"symmetric\" : to._id, \"weight\" : 10 }\n)\nlet list5 = append (list1, list2 )\nlet list6 = append (list3, list4)\nfor link in append (list5, list6)\n update { _key : document(link.linkid)._key, symmetric : link.symmetric, weight : link.weight } in links\n return link\n","parameter":{},"name":"Set links class and weight"},{"value":"/* 01 - Set link symmetry and weight:\n social, contact, & disclosure = 0 as they are asymmetric links and are self referential\n belongto, control, & customer = 1 as potential false claims of membership, control, and customer are more likely \n member, controlledby, & vendor = 2 as false claims of membership, controlledby, and vendor are less likely\n symmetric links = 10 as they have the highest value as the claims are reciprocal (e.g., from == to && to == from && not self referential)\n */\nlet list1 = (\nfor link in links\n filter (link.attr == \"social\") || (link.attr == \"contact\") || (link.attr == \"disclosure\")\n return { \"linkid\" : 
link._id, \"symmetric\" : \"\", \"weight\" : 0 }\n)\nlet list2 = (\nfor link in links\n filter (link.attr == \"belongto\") || (link.attr == \"control\") || (link.attr == \"customer\")\n return { \"linkid\" : link._id, \"symmetric\" : \"\", \"weight\" : 1 }\n)\nlet list3 = (\nfor link in links\n filter (link.attr == \"member\") || (link.attr == \"controlledby\") || (link.attr == \"vendor\")\n return { \"linkid\" : link._id, \"symmetric\" : \"\", \"weight\" : 2 }\n)\nlet list4 = (\nfor from in links\n for to in links\n filter to._from == from._to && from._from == to._to && from._id != to._id && from.attr != to.attr\n return { \"linkid\" : from._id, \"symmetric\" : to._id, \"weight\" : 10 }\n)\nlet list5 = append (list1, list2 )\nlet list6 = append (list3, list4)\nfor link in append (list5, list6)\n update { _key : document(link.linkid)._key, symmetric : link.symmetric, weight : link.weight } in links\n return link\n","parameter":{},"name":"01 - Set link symmetry and weight"},{"value":"/* 02 - Set url classification by searching links for all \"member\", \"belongto\", \"vendor\", \"customer\", \"social\", \"contact\", and \"disclosure\" attributes\n and setting class accordingly. 
Attributes of \"control\" and \"controlled\", as well as \"Publisher\" classification are set with subsequent queries.\n \"Association\" - only Associations have members, but Associations, Publishers and Vendors can belong to Associations\n \"Vendor\" - only Vendors have customers\n */\nlet list1 = (\nfor link in links\n filter ((link.attr == \"social\") || (link.attr == \"contact\") || (link.attr == \"disclosure\")) && (link._to != link._from)\n return distinct { \"urlid\" : link._to , \"class\" : link.attr }\n)\nlet list2 = (\nfor link in links\n filter (link.attr == \"member\")\n return distinct { \"urlid\" : link._from, \"class\" : \"Association\" }\n)\nlet list3 = (\nfor link in links\n filter link.attr == \"belongto\"\n return distinct { \"urlid\" : link._to, \"class\" : \"Association\" }\n)\nlet list4 = (\nfor link in links\n filter (link.attr == \"customer\")\n return distinct { \"urlid\" : link._from, \"class\" : \"Vendor\" }\n)\nlet list5 = (\nfor link in links\n filter link.attr == \"vendor\"\n return distinct { \"urlid\" : link._to, \"class\" : \"Vendor\" }\n)\nlet list6 = append (list1, list2)\nlet list7 = append (list3, list4)\nlet list8 = append (list6, list7)\nfor url in append (list8, list5)\n update { _key : document(url.urlid)._key, class : url.class } in urls\n return url\n","parameter":{},"name":"02 - Set url classification"},{"value":"/* 03 - Set controlled url classification by searching all \"control\" and \"controlledby\" links for the controlling and controlled urls, then updating the \"controlled\" field to point to the controlling url, setting the control count to 1, and setting the classification from the associated url.\n */\nlet list1 = (\nfor link in links\n filter link.attr == \"control\" && document(link._from).class != \"\"\n return distinct { \"urlid\" : link._to, \"ctrlid\" : link._from, \"class\" : document(link._from).class }\n)\nlet list2 = (\nfor link in links\n filter link.attr == \"controlledby\" && document(link._to).class 
!= \"\"\n return distinct { \"urlid\" : link._from, \"ctrlid\" : link._to, \"class\" : document(link._to).class }\n)\nfor url in append (list1, list2)\n update { _key : document(url.urlid)._key, controlled : url.ctrlid, ctrlcnt : 1, class : url.class } in urls\n return url","parameter":{},"name":"03 - Set controlled url classification"},{"value":"/* 04 - Set Publisher url classification by setting any urls not previously classified as Association, Vendor, social, contact, or disclosure to \"Publisher\" */\nfor url in urls\n filter url.class == \"\"\n update { _key : url._key, class : \"Publisher\" } in urls\n return url.url","parameter":{},"name":"04 - Set Publisher url classification"},{"value":"/* 05 - Tag Duplicate Controls - tag urls with duplicate controlling urls by finding nonsymmetric links from different ctrlurls to the same url.\n */\nlet list1 = (\nfor link in links\n filter link.attr == \"control\" && link.symmetric == \"\"\n return { \"urlid\" : link._to, \"ctrlid\" : link._from }\n)\nlet list2 = (\nfor link in links\n filter link.attr == \"controlledby\" && link.symmetric == \"\"\n return { \"urlid\" : link._from, \"ctrlid\" : link._to }\n)\nfor url in append (list1, list2)\n collect urlid = url.urlid with count into ctrlcnt\n update { _key : document(urlid)._key, ctrlcnt : ctrlcnt } in urls\n return { \"urlid\" : urlid, \"ctrlcnt\" : ctrlcnt }","parameter":{},"name":"05 - Tag Duplicate Controls"},{"value":"/* 06 - Compute url ranking for Associations, Publishers, and Vendors by counting the number of links to the url, the total weight of those links, and\n the average weight of those links, then updating the url document accordingly.\n */\nlet list1 = (\n for url in urls\n filter (url.class == \"Association\") || (url.class == \"Publisher\") || (url.class == \"Vendor\")\n return url\n)\nlet list2 = (\n for url in list1\n for v, e in 1..1 any url graph \"urlrank\"\n return { \"urlid\" : v._id, \"weight\" : e.weight }\n)\nlet list3 = (\nfor v in 
list2\n collect urlid = v.urlid into g\n return { \"urlid\" : urlid, \"nlinks\" : count(g[*].v.weight), \"total\" : sum(g[*].v.weight), \"average\" : average(g[*].v.weight) }\n)\nfor url in list3\n update { _key : document(url.urlid)._key,\n nlinks : url.nlinks, \n total : url.total, \n average : url.average } in urls\n return url","parameter":{},"name":"06 - Compute url ranking"},{"value":"for link in links\n filter ((link.attr == \"member\") || (link.attr == \"belongto\") || (link.attr == \"vendor\") || (link.attr == \"customer\") || (link.attr == \"control\") || (link.attr == \"controlledby\")) && link.symmetric == \"\"\n return { \"link.from\" : link.from, \"link.attr\" : link.attr, \"link.to\" : link.to, \"link.symmetric\" : link.symmetric }","parameter":{},"name":"List asymmetric links"},{"value":"/* List member links where to field has subdirectories\n */\nfor link in links\n filter link.attr == \"member\" and link.to =~ \"https://.*/.*/$\"\n for back in links\n filter back.attr == \"belongto\" and back.to == link.from and back.from like link.to\nreturn { \"back.from\" : back.from, \"back.attr\" : back.attr, \"back.to\" : back.to, \"link.from\" : link.from, \"link.attr\" : link.attr, \"link.to\" : link.to }","parameter":{},"name":"Test"},{"value":"/* List Associations by member by looking at all inbound and outbound links to associations.\n */\nlet list1 = (\n for link in links\n filter link.attr == \"belongto\"\n collect from = link.from into groups\n return { \"member\" : from, \"associations\" : groups[*].link.to }\n)\nlet list2 = (\n for link in links\n filter link.attr == \"member\"\n collect to = link.to into groups\n return { \"member\" : to, \"associations\" : groups[*].link.from }\n)\nfor list in append(list1, list2)\n collect member = list.member aggregate associations = sorted_unique(list.associations)\n return { \"member\" : member, \"count\" : count(unique(associations[**])), \"associations\" : sorted_unique(associations[**]) 
}\n","parameter":{},"name":"List Associations by member"},{"value":"/* List Association membership by looking at all inbound and outbound links to associations.\n */\nlet list1 = (\n for link in links\n filter link.attr == \"belongto\"\n collect to = link.to into groups\n return { \"association\" : to, \"members\" : groups[*].link.from }\n)\nlet list2 = (\n for link in links\n filter link.attr == \"member\"\n collect from = link.from into groups\n return { \"association\" : from, \"members\" : groups[*].link.to }\n)\nfor list in append(list1, list2)\n collect association = list.association aggregate members = sorted_unique(list.members)\n return { \"association\" : association, \"count\" : count(unique(members[**])), \"members\" : sorted_unique(members[**]) }\n","parameter":{},"name":"List Association membership"},{"value":"/* List common Association membership by looking at all inbound and outbound links to associations and finding common membership.\n */\nlet list1 = (\n for link in links\n filter link.attr == \"belongto\"\n collect from = link.from into groups\n return { \"member\" : from, \"associations\" : groups[*].link.to }\n)\nlet list2 = (\n for link in links\n filter link.attr == \"member\"\n collect to = link.to into groups\n return { \"member\" : to, \"associations\" : groups[*].link.from }\n)\nlet list3 = (\n for list in append(list1, list2)\n collect member = list.member aggregate associations = sorted_unique(list.associations)\n return { \"member\" : member, \"associations\" : sorted_unique(associations[**]) }\n)\nlet list4 = (\n for member in list3\n filter count(member.associations) > 1\n return member\n)\nfor url in urls\n filter url.class == \"Association\"\n for mem in list4\n filter url.url in mem.associations\n collect association = url.url aggregate associations = sorted_unique(mem.associations[**])\n return distinct { \"association\" : association, \"count\" : count(sorted_unique(associations[**])), \"list\" : sorted_unique(associations[**]) 
}","parameter":{},"name":"List common Association membership"},{"value":"/* Remove self referential links\n */\nfor link in links\n filter link.to == link.from\n remove link in links\n","parameter":{},"name":"00 - Remove self referential links"},{"value":"for link in links\n filter link.symmetric != \"\" and link.attr == \"belongto\" and document(link._from).class == \"Publisher\"\n return distinct { \"srcurl\" : link.from, \"class\" : document(link._from).class, \"srcattr\" : link.attr, \"refurl\" : link.to, \"average\" : document(link._from).average }","parameter":{},"name":"List Publishers with symmetric membership links"},{"value":"/* List backward links for reference urls with subdirectories\n */\nlet listm = (\nfor link in links\n filter link.attr == \"member\" and link.to =~ \"https://.*/.*/$\"\nreturn { \"from\" : link.from, \"attr\" : link.attr, \"to\" : link.to, \"back\" : regex_matches (link.to, \"https://[^/]*/\") }\n)\n\nlet listmb = (\nfor link in listm\n for back in links\n filter back.attr == \"belongto\" and back.from == link.back[0]\nreturn distinct { \"srcurl\" : link.to, \"attr\" : back.attr, \"refurl\" : back.to }\n)\n\nlet listc = (\nfor link in links\n filter link.attr == \"control\" and link.to =~ \"https://.*/.*/$\"\nreturn { \"from\" : link.from, \"attr\" : link.attr, \"to\" : link.to, \"back\" : regex_matches (link.to, \"https://[^/]*/\") }\n)\n\nlet listcb = (\nfor link in listm\n for back in links\n filter back.attr == \"controlledby\" and back.from == link.back[0]\nreturn distinct { \"srcurl\" : link.to, \"attr\" : back.attr, \"refurl\" : back.to }\n)\n\nlet listv = (\nfor link in links\n filter link.attr == \"customer\" and link.to =~ \"https://.*/.*/$\"\nreturn { \"from\" : link.from, \"attr\" : link.attr, \"to\" : link.to, \"back\" : regex_matches (link.to, \"https://[^/]*/\") }\n)\n\nlet listvb = (\nfor link in listm\n for back in links\n filter back.attr == \"vendor\" and back.from == link.back[0]\nreturn distinct { \"srcurl\" : 
link.to, \"attr\" : back.attr, \"refurl\" : back.to }\n)\n\nlet list = append (listmb, listcb)\nfor link in append (list, listvb)\nreturn { \"srcurl\" : link.srcurl, \"attr\" : link.attr, \"refurl\" : link.refurl }\n","parameter":{},"name":"List backward links for reference urls with subdirectories"}]