CopyrightArmor release v1
Copy05 committed Apr 29, 2024
1 parent 359d7d7 commit c52a675
Showing 11 changed files with 202 additions and 33 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -9,6 +9,7 @@
- **Web Scraping Engine**: CopyrightArmor uses a robust web scraping engine to crawl and analyze websites for potentially infringing content.
- **Recursive Scraping**: Recursively scans websites.
- **Google Optimized**: when `--google` and `--google-search` are used without `-url`, the web scraping engine is optimized for Google Search.
- **Google SERP Engine**: CopyrightArmor detects most pirate sites in Google Search results. ([always check for false positives](https://github.com/Copy05/CopyrightArmor/discussions/4))
- **Flexible**: You can configure how and what type of links it should scrape:
  - Exclude Social Media Links
  - Exclude Query parameter links
@@ -48,6 +49,8 @@ pip install -r requirements.txt
```

4. Open `src/hashes.json` and add every piece of content you want to scan for, using this syntax:

**`--google-search`** scans only for the content listed under `"entertainment"`.
```json
{
    "images": [
@@ -65,6 +68,14 @@ pip install -r requirements.txt
"hash": "3b99f49776f433aeb000fa010e452879198fe4be7f6660552527b53304268343",
"description": "Another Pink Cherry Tree Exe"
}
],
"entertainment": [
{
"title": "EntertainRewind 2024",
"original_url": "example.com",
"copyright_owner": "Entertainmasters",
"hash": "c8392dc67d913d39664e0fc400280a2de03107348f7432e226194f0a7e4eeebe"
}
]
}
```
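
For example, a Google-optimized scan for the `"entertainment"` entry above could be started like this (illustrative only — the query string is made up, and the exact flag spellings should be confirmed with `--help`):

```
python src/CopyrightArmor.py --google --google-search "EntertainRewind 2024 watch online" --verbose
```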
@@ -87,4 +98,4 @@ I do accept contributions to this project. create a pull request and describe yo
| Manga | +195,188,170 |
| Models | +167,086,838 |
| XXX | +11,971,422 |
| WGCZ (BangBros) | +11,355,801 |
| WGCZ (BangBros) | +11,355,801 |
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,4 +2,4 @@ beautifulsoup4==4.12.2
selenium==4.14.0
webdriver-manager==4.0.1
requests==2.31.0
colorama==0.4.6
colorama==0.4.6
2 changes: 1 addition & 1 deletion setup.py
@@ -22,7 +22,7 @@

setup(
    name="CopyrightArmor",
    version="0.2",
    version="1.0",
    author="Copy05",
    description="A tool that scans the web for pirated content",
    url="https://github.com/Copy05/CopyrightArmor/",
109 changes: 108 additions & 1 deletion src/ContentMatching.py
@@ -21,12 +21,119 @@
import requests
import hashlib
import json
import re

from bs4 import BeautifulSoup
from colorama import Style, Fore
from urllib.parse import urljoin
from IO import extract_domain, LoadIgnoreFileExts

def ScanTitle(title, my_content):
    with open('patterns.json', 'r') as file:
        patterns = json.load(file)['patterns']

    anime_pattern = re.compile(patterns['anime_pattern'], re.IGNORECASE)
    turkish_pattern = re.compile(patterns['turkish_pattern'], re.IGNORECASE) # Some may include Turkish Language.
    pirated_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in patterns['pirated_patterns']]
    resolution_pattern = re.compile(patterns['resolution_pattern'], re.IGNORECASE)
    episode_pattern = re.compile(patterns['episode_pattern'], re.IGNORECASE)
    staffel_pattern = re.compile(patterns['staffel_pattern'], re.IGNORECASE)
    legal_sites_pattern = re.compile(patterns['legal_sites_pattern'], re.IGNORECASE)
    anime_deep_pattern = re.compile(patterns['anime_deep_pattern'], re.IGNORECASE)
    manga_chapter_pattern = re.compile(patterns['manga_chapter_pattern'], re.IGNORECASE)
    manga_online_pattern = re.compile(patterns['manga_online_pattern'], re.IGNORECASE)

    if my_content not in title:
        return False

    # Check for the presence of "Watch Anime" in the title
    if "Watch Anime" in title:
        return True

    if anime_pattern.match(title) or turkish_pattern.match(title):
        return True
    for pattern in pirated_patterns:
        if re.match(pattern, title):
            return True
    if episode_pattern.match(title) or staffel_pattern.match(title):
        return True
    if manga_chapter_pattern.match(title) or manga_online_pattern.match(title):
        return True
    if legal_sites_pattern.search(title):
        return False
    if anime_deep_pattern.match(title):
        return True
    if resolution_pattern.match(title):
        return True
    return False

def ScanGoogleLink(url, title, DebugInformation=False, verbose=False):
    from GoogleScrape import SearchQuery, infriding_data, UniqueFiles, infringing_urls

    titles_list = []
    contentFlagged = False

    Query = SearchQuery.replace("+", " ").replace('%2C', ',').replace('%20', ' ')

    if verbose:
        print(f"{Fore.YELLOW}URL: {url}\nTitle: {title}{Style.RESET_ALL}")

    with open("hashes.json") as file:
        data = json.load(file)

    entertainment_data = data.get("entertainment", [])

    # For every entry inside the entertainment data, append the show name to the list.
    for entry in entertainment_data:
        m_title = entry.get("title", "")
        titles_list.append(m_title)

    for content in titles_list:

        # To differentiate content, check whether the show title appears inside the search result title
        # (e.g. "Watch SHOW1" contains "SHOW1") so that the result gets flagged as "SHOW1".
        if ScanTitle(title, content) and content.lower() in title.lower():
            if verbose:
                print(f"{Fore.RED}COPYRIGHTED MATERIAL FOUND{Style.RESET_ALL}")
            contentFlagged = True
        else:
            if verbose:
                print(f"{Fore.GREEN}LEGAL{Style.RESET_ALL}")
            contentFlagged = False

        if contentFlagged and content.lower() in title.lower():
            for entry in data['entertainment']:

                if entry['title'].lower() in content.lower():

                    original_owner = entry["copyright_owner"]
                    original_source = entry["original_url"]

                    # Skip the URL if it is the original source.
                    if url == original_source:
                        continue

                    infringing_urls.add(url)
                    UniqueFiles.add(url)

                    infriding_data.append({
                        "url": url,
                        "type": "Copyrighted Show",
                        "original_url": original_source,
                        "copyright_owner": original_owner,
                        "description": entry['title'],
                    })

                    if verbose:
                        print(Fore.RED, f"\nCopyright Infringing Show has been found on {url}.\nSearch Result Title: {title}\nCopyrighted Work: {entry['title']}\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
                    else:
                        print(Fore.RED, f"\nCopyright Infringing Show has been found on {url}.\nCopyrighted Work: {entry['title']}\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
                    print(Style.RESET_ALL)

            break

def ScanImage(soup : BeautifulSoup, url, DebugInformation : bool):
    from Scrape import ScannedImages, infriding_data, infringing_urls, TheBaseURL, UniqueFiles

@@ -151,4 +258,4 @@ def ScanFiles(soup: BeautifulSoup, url, DebugInformation: bool):
print(Fore.RED, f"\nCopyright Infringing File (\"{link_hash}\") has been found on {url}.\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
print(Style.RESET_ALL)

break
break
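
Note: the new `ScanTitle` helper loads its regular expressions from a `patterns.json` file that is not shown in this diff. A minimal sketch of the layout it appears to expect, with placeholder expressions rather than the project's real patterns:

```json
{
    "patterns": {
        "anime_pattern": "watch .* anime",
        "turkish_pattern": "izle",
        "pirated_patterns": ["free download", "watch .* online free"],
        "resolution_pattern": "(480p|720p|1080p)",
        "episode_pattern": "episode [0-9]+",
        "staffel_pattern": "staffel [0-9]+",
        "legal_sites_pattern": "(netflix|crunchyroll|amazon)",
        "anime_deep_pattern": "(sub|dub)",
        "manga_chapter_pattern": "chapter [0-9]+",
        "manga_online_pattern": "read .* manga online"
    }
}
```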
6 changes: 0 additions & 6 deletions src/CopyrightArmor.py
@@ -88,14 +88,8 @@ def PrintVersion():
# To Avoid Long Execution Time when not using the scraping engine.
from GoogleScrape import GoogleScrape

if args.detailed_report and args.report_file is False:
print(Fore.RED, "Error: Invalid Argument: \"--detailed-report\" because \"--report_file\" is false")
print(Style.RESET_ALL)
exit(1)

GoogleScrape(Query=args.google_search, RateLimmit=args.rate_limit, verbose=args.verbose, ReportFile=args.report_file)


if not any(vars(args).values()):
print(Fore.RED, "Error: No arguments provided. Use -h or --help for usage information.")
print(Style.RESET_ALL)
55 changes: 32 additions & 23 deletions src/GoogleScrape.py
@@ -35,34 +35,49 @@
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from IO import SaveReport
from IO import SaveReport, LoadWhitelist, extract_domain
from verbose_print import PrintFoundLinks
from ContentMatching import ScanGoogleLink
from ScrapingEngine import FilterLinks
from utils import GetSettings

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--log-level=3')
chrome_options.add_argument('--disable-logging')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--remote-debugging-pipe")

driver = webdriver.Chrome(options=chrome_options)

Found_Links = set()
ScannedImages = set()
UniqueFiles = set()
infringing_urls = set()
infriding_data = []
Index = 1
CookieBannerClicked = False
SearchQuery = None

MORE_RESULTS_BUTTON_XPATHS = ["//*[@id='botstuff']/div/div[3]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[2]/h3/span"]
MORE_RESULTS_BUTTON_XPATHS = ["//*[@id='botstuff']/div/div[3]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[2]/h3/span", "//*[@id='kp-wp-tab-cont-overview']/div/div[3]/div/div/div[4]/a[1]/h3/div", "//*[@id='botstuff']/div/div[4]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-TvmWatch']/div/div[3]/div/div/div[4]/a[1]/h3/div"]

def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateLimmitTime=2):

    global CookieBannerClicked
    global SearchQuery

    URL = f"https://google.com/search?q={Query}&cs=0&filter=0&safe=off&nfpr=1"

    warnings.filterwarnings("ignore", category=InsecureRequestWarning)

    soup = None
    MAXIMAL_RETRIES = 5
    MAXIMAL_RETRIES = 3
    Retries = 0
    SearchQuery = Query
    whitelist = LoadWhitelist()

    if RateLimmit:
        time.sleep(RateLimmitTime)
@@ -91,21 +106,13 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
for link in soup.find_all('a', href=True):
next_url = urljoin(URL, link['href'])

if next_url.startswith("mailto:"):
if verbose:
print(Fore.YELLOW, f"Skipping {next_url} because 'mailto' links arent allowed")
print(Style.RESET_ALL)
continue
if next_url.startswith("javascript:"):
if verbose:
print(Fore.YELLOW, f"Skipping {next_url} because 'javascript' links arent allowed")
print(Style.RESET_ALL)
if FilterLinks(next_url, verbose, True):
continue

if next_url not in Found_Links and "google.com" not in next_url:
Found_Links.add(next_url)
FoundLinkCount += 1

if verbose:
print(Fore.YELLOW, f"{FoundLinkCount} Links has been added to the List. | {len(Found_Links)} Links in the List")
print(Style.RESET_ALL)
@@ -123,7 +130,10 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
Found_Links.add(next_url)
FoundLinkCount += 1
FoundAnyLinks = True


if extract_domain(next_url) not in whitelist:
ScanGoogleLink(url=next_url, title=link.text.strip(), verbose=verbose, DebugInformation=False)

if CookieBannerClicked is False:
try:
cookie_banner = driver.find_element(By.XPATH, "//*[@id='CXQnmb']")
@@ -136,8 +146,7 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
pass

try:
print(Fore.GREEN, f"Searching [Links Found: {len(Found_Links)}]")
print(Style.RESET_ALL)
print(f"{Fore.GREEN}Searching [Links Found: {len(Found_Links)}] {Fore.WHITE}| {Fore.RED}Infriding Links Found: {len(infriding_data)}{Style.RESET_ALL}")

if verbose:
print(Fore.YELLOW, f"{FoundLinkCount} Links has been added to the List. | {len(Found_Links)} Links in the List")
@@ -168,22 +177,22 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
pass

print(f"Query: {Query}\nFound Links: {len(Found_Links)}")
print(f"Query: {Query}\nFound Links: {len(Found_Links)}\n{Fore.RED}Infriding Search Results: {len(infriding_data)}{Style.RESET_ALL}")
if ReportFile:
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
exit()
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
exit()

except requests.exceptions.TooManyRedirects:
print(Fore.RED, "Overloaded.")
print(Style.RESET_ALL)

if ReportFile:
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
exit()
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
exit()

except KeyboardInterrupt:
print("Exiting Scrape Mode.")

if ReportFile:
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
exit()
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
exit()
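
Note: the inline `mailto:`/`javascript:` checks removed above are replaced by a call to `FilterLinks` from `ScrapingEngine`, which is not shown in this diff. A minimal sketch of what such a filter could look like, purely as an assumption about its contract (a truthy return value means "skip this link"); the third parameter name is hypothetical:

```python
from colorama import Fore, Style

def FilterLinks(next_url: str, verbose: bool = False, google_mode: bool = False) -> bool:
    """Hypothetical stand-in for ScrapingEngine.FilterLinks: return True to skip next_url."""
    for scheme in ("mailto:", "javascript:"):
        if next_url.startswith(scheme):
            if verbose:
                print(Fore.YELLOW, f"Skipping {next_url} because '{scheme[:-1]}' links aren't allowed")
                print(Style.RESET_ALL)
            return True
    return False
```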
7 changes: 7 additions & 0 deletions src/IO.py
@@ -40,6 +40,13 @@ def LoadIgnoreFileExts() -> list[str]:
    exts = data["ignore_exts"]
    return exts

def LoadWhitelist() -> list[str]:
    with open('filters.json', 'r') as file:
        data = json.load(file)

    wl = data["whitelist"]
    return wl

def extract_domain(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc
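
As used in `GoogleScrape.py`, the whitelist is compared against `extract_domain(next_url)`, which returns only the host portion of a URL. A small illustrative check (note that whitelist entries have to be stored in whatever form the comparison expects — `urlparse` yields `www.youtube.com`, not the full URL stored in `filters.json`):

```python
from urllib.parse import urlparse

def extract_domain(url):
    # Same helper as in IO.py: return only the network location of the URL.
    parsed_url = urlparse(url)
    return parsed_url.netloc

whitelist = ["https://www.youtube.com/"]  # entry format as stored in filters.json
domain = extract_domain("https://www.youtube.com/watch?v=abc123")
print(domain)                   # "www.youtube.com"
print(domain not in whitelist)  # True with this entry format, so the link would still be scanned
```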
3 changes: 3 additions & 0 deletions src/Scrape.py
@@ -46,6 +46,9 @@
chrome_options.add_argument('--log-level=3')
chrome_options.add_argument('--disable-logging')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--remote-debugging-pipe")

driver = webdriver.Chrome(options=chrome_options)

3 changes: 3 additions & 0 deletions src/filters.json
@@ -38,5 +38,8 @@
".editorconfig",
".npmrc",
".dockerignore"
],
"whitelist": [
"https://www.youtube.com/"
]
}
8 changes: 8 additions & 0 deletions src/hashes.json
@@ -14,5 +14,13 @@
"hash": "81ecf8a6c049c784bd48dc40bcbd3840c7a95d31bd4a82ed40db9610cb639de2",
"description": "Mediakit"
}
],
"entertainment": [
{
"title": "Test123",
"original_url": "https://imdb.com/zuez9zw7ez79z79q",
"copyright_owner": "Example Inc",
"hash": "c8392dc67d913d39664e0fc400280a2de03107348f7432e226194f0a7e43f2be"
}
]
}
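
The 64-character hex digests in `hashes.json` look like SHA-256 (and `ContentMatching.py` imports `hashlib`). Assuming that, a small helper for producing the `hash` value of a local file might look like this — the file name below is hypothetical:

```python
import hashlib

def sha256_of_file(path: str) -> str:
    """Return the hex SHA-256 digest of a file, e.g. for a hashes.json entry."""
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        # Read in chunks so large files do not have to fit in memory.
        for chunk in iter(lambda: handle.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Example usage (hypothetical file name):
# print(sha256_of_file("PinkCherryTree.exe"))
```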