diff --git a/README.md b/README.md
index 51b5b6f..7f344a1 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@
 - **Web Scraping Engine**: CopyrightArmor uses a robust web scraping engine to crawl and analyze websites for potentially infringing content.
 - **Recursive Scrapping**: Recursively scans websites.
 - **Google Optimized**: by using `--google` and `--google-search` without `-url` the web scraping engine will be optimized for Google Search.
+- **Google SERP Engine**: CopyrightArmor detects most pirate sites in Google Search results. ([always check for false positives](https://github.com/Copy05/CopyrightArmor/discussions/4))
 - **Flexible**: You can configurate how and what type of links it should scrape:
   - Exclude Social Media Links
   - Exclude Query parameter links
@@ -48,6 +49,8 @@ pip install -r requirements.txt
 ```
 4. Open up `src/hashes.json` and add all content that you want to scan for and use this syntax:
+
+**`--google-search`** scans only for the content listed under the `"entertainment"` key.
 
 ```json
 {
     "images": [
@@ -65,6 +68,14 @@ pip install -r requirements.txt
             "hash": "3b99f49776f433aeb000fa010e452879198fe4be7f6660552527b53304268343",
             "description": "Another Pink Cherry Tree Exe"
         }
+    ],
+    "entertainment": [
+        {
+            "title": "EntertainRewind 2024",
+            "original_url": "example.com",
+            "copyright_owner": "Entertainmasters",
+            "hash": "c8392dc67d913d39664e0fc400280a2de03107348f7432e226194f0a7e4eeebe"
+        }
     ]
 }
 ```
@@ -87,4 +98,4 @@ I do accept contributions to this project. create a pull request and describe yo
 | Manga | +195,188,170 |
 | Models | +167,086,838 |
 | XXX | +11,971,422 |
-| WGCZ (BangBros) | +11,355,801 |
+| WGCZ (BangBros) | +11,355,801 |
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index e2f87cf..1ba12ba 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,4 @@ beautifulsoup4==4.12.2
 selenium==4.14.0
 webdriver-manager==4.0.1
 requests==2.31.0
-colorama==0.4.6
+colorama==0.4.6
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 98be41a..a353f82 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@
 setup(
     name="CopyrightArmor",
-    version="0.2",
+    version="1.0",
     author="Copy05",
     description="A tool that scans the web for pirated content",
     url="https://github.com/Copy05/CopyrightArmor/",
diff --git a/src/ContentMatching.py b/src/ContentMatching.py
index 11acf05..075ca38 100644
--- a/src/ContentMatching.py
+++ b/src/ContentMatching.py
@@ -21,12 +21,119 @@
 import requests
 import hashlib
 import json
+import re
 from bs4 import BeautifulSoup
 from colorama import Style, Fore
 from urllib.parse import urljoin
 from IO import extract_domain, LoadIgnoreFileExts
 
+def ScanTitle(title, my_content):
+    with open('patterns.json', 'r') as file:
+        patterns = json.load(file)['patterns']
+
+    anime_pattern = re.compile(patterns['anime_pattern'], re.IGNORECASE)
+    turkish_pattern = re.compile(patterns['turkish_pattern'], re.IGNORECASE) # Some titles may include Turkish language.
+    pirated_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in patterns['pirated_patterns']]
+    resolution_pattern = re.compile(patterns['resolution_pattern'], re.IGNORECASE)
+    episode_pattern = re.compile(patterns['episode_pattern'], re.IGNORECASE)
+    staffel_pattern = re.compile(patterns['staffel_pattern'], re.IGNORECASE)
+    legal_sites_pattern = re.compile(patterns['legal_sites_pattern'], re.IGNORECASE)
+    anime_deep_pattern = re.compile(patterns['anime_deep_pattern'], re.IGNORECASE)
+    manga_chapter_pattern = re.compile(patterns['manga_chapter_pattern'], re.IGNORECASE)
+    manga_online_pattern = re.compile(patterns['manga_online_pattern'], re.IGNORECASE)
+
+    if my_content not in title:
+        return False
+
+    # Check for the presence of "Watch Anime" in the title
+    if "Watch Anime" in title:
+        return True
+
+    if anime_pattern.match(title) or turkish_pattern.match(title):
+        return True
+    for pattern in pirated_patterns:
+        if re.match(pattern, title):
+            return True
+    if episode_pattern.match(title) or staffel_pattern.match(title):
+        return True
+    if manga_chapter_pattern.match(title) or manga_online_pattern.match(title):
+        return True
+    if legal_sites_pattern.search(title):
+        return False
+    if anime_deep_pattern.match(title):
+        return True
+    if resolution_pattern.match(title):
+        return True
+    return False
+
+def ScanGoogleLink(url, title, DebugInformation=False, verbose=False):
+    from GoogleScrape import SearchQuery, infriding_data, UniqueFiles, infringing_urls
+
+    titles_list = []
+    contentFlagged = False
+
+    Query = SearchQuery.replace("+", " ").replace('%2C', ',').replace('%20', ' ')
+
+    if verbose:
+        print(f"{Fore.YELLOW}URL: {url}\nTitle: {title}{Style.RESET_ALL}")
+
+    with open("hashes.json") as file:
+        data = json.load(file)
+
+    entertainment_data = data.get("entertainment", [])
+
+    # For every entry in the entertainment data, append the show's title to the list.
+    for entry in entertainment_data:
+        m_title = entry.get("title", "")
+        titles_list.append(m_title)
+
+    for content in titles_list:
+
+        # To differentiate content, check whether the show's title appears inside the search-result title,
+        # e.g. "Watch SHOW1" contains "SHOW1", so the result gets flagged for "SHOW1".
+        if ScanTitle(title, content) and content.lower() in title.lower():
+            if verbose:
+                print(f"{Fore.RED}COPYRIGHTED MATERIAL FOUND{Style.RESET_ALL}")
+            contentFlagged = True
+        else:
+            if verbose:
+                print(f"{Fore.GREEN}LEGAL{Style.RESET_ALL}")
+            contentFlagged = False
+
+    # To differentiate content, check whether the show's title appears inside the search-result title,
+    # e.g. "Watch SHOW1" contains "SHOW1", so the result gets flagged for "SHOW1".
+    if contentFlagged and content.lower() in title.lower():
+        for entry in data['entertainment']:
+
+            if entry['title'].lower() in content.lower():
+
+                original_owner = entry["copyright_owner"]
+                original_source = entry["original_url"]
+
+                # If the URL is the original source
+                if url == original_source:
+                    continue
+
+                infringing_urls.add(url)
+                UniqueFiles.add(url)
+
+                infriding_data.append({
+                    "url": url,
+                    "type": "Copyrighted Show",
+                    "original_url": original_source,
+                    "copyright_owner": original_owner,
+                    "description": entry['title'],
+                })
+
+                if verbose:
+                    print(Fore.RED, f"\nCopyright Infringing Show has been found on {url}.\nSearch Result Title: {title}\nCopyrighted Work: {entry['title']}\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
+                else:
+                    print(Fore.RED, f"\nCopyright Infringing Show has been found on {url}.\nCopyrighted Work: {entry['title']}\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
+                print(Style.RESET_ALL)
+
+            break
+
 def ScanImage(soup : BeautifulSoup, url, DebugInformation : bool):
     from Scrape import ScannedImages, infriding_data, infringing_urls, TheBaseURL, UniqueFiles
@@ -151,4 +258,4 @@ def ScanFiles(soup: BeautifulSoup, url, DebugInformation: bool):
                     print(Fore.RED, f"\nCopyright Infringing File (\"{link_hash}\") has been found on {url}.\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
                     print(Style.RESET_ALL)
-                    break
+                    break
\ No newline at end of file
diff --git a/src/CopyrightArmor.py b/src/CopyrightArmor.py
index b6dd416..0c58bca 100644
--- a/src/CopyrightArmor.py
+++ b/src/CopyrightArmor.py
@@ -88,14 +88,8 @@ def PrintVersion():
     # To Avoid Long Execution Time when not using the scraping engine.
     from GoogleScrape import GoogleScrape
 
-    if args.detailed_report and args.report_file is False:
-        print(Fore.RED, "Error: Invalid Argument: \"--detailed-report\" because \"--report_file\" is false")
-        print(Style.RESET_ALL)
-        exit(1)
-
     GoogleScrape(Query=args.google_search, RateLimmit=args.rate_limit, verbose=args.verbose, ReportFile=args.report_file)
-
     if not any(vars(args).values()):
         print(Fore.RED, "Error: No arguments provided. Use -h or --help for usage information.")
         print(Style.RESET_ALL)
diff --git a/src/GoogleScrape.py b/src/GoogleScrape.py
index 948833a..4489adb 100644
--- a/src/GoogleScrape.py
+++ b/src/GoogleScrape.py
@@ -35,34 +35,49 @@
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 
-from IO import SaveReport
+from IO import SaveReport, LoadWhitelist, extract_domain
 from verbose_print import PrintFoundLinks
+from ContentMatching import ScanGoogleLink
+from ScrapingEngine import FilterLinks
+from utils import GetSettings
 
 chrome_options = Options()
 chrome_options.add_argument('--headless')
 chrome_options.add_argument('--log-level=3')
 chrome_options.add_argument('--disable-logging')
 chrome_options.add_argument('--disable-dev-shm-usage')
+chrome_options.add_argument("--disable-gpu")
+chrome_options.add_argument("--disable-extensions")
+chrome_options.add_argument("--remote-debugging-pipe")
 
 driver = webdriver.Chrome(options=chrome_options)
 
 Found_Links = set()
+ScannedImages = set()
+UniqueFiles = set()
+infringing_urls = set()
+infriding_data = []
 Index = 1
 CookieBannerClicked = False
+SearchQuery = None
 
-MORE_RESULTS_BUTTON_XPATHS = ["//*[@id='botstuff']/div/div[3]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[2]/h3/span"]
+MORE_RESULTS_BUTTON_XPATHS = ["//*[@id='botstuff']/div/div[3]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[2]/h3/span", "//*[@id='kp-wp-tab-cont-overview']/div/div[3]/div/div/div[4]/a[1]/h3/div", "//*[@id='botstuff']/div/div[4]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-TvmWatch']/div/div[3]/div/div/div[4]/a[1]/h3/div"]
 
 def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateLimmitTime=2):
     global CookieBannerClicked
+    global SearchQuery
 
     URL = f"https://google.com/search?q={Query}&cs=0&filter=0&safe=off&nfpr=1"
     warnings.filterwarnings("ignore", category=InsecureRequestWarning)
     soup = None
-    MAXIMAL_RETRIES = 5
+    MAXIMAL_RETRIES = 3
     Retries = 0
 
+    SearchQuery = Query
+    whitelist = LoadWhitelist()
+
     if RateLimmit:
         time.sleep(RateLimmitTime)
@@ -91,21 +106,13 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
         for link in soup.find_all('a', href=True):
             next_url = urljoin(URL, link['href'])
 
-            if next_url.startswith("mailto:"):
-                if verbose:
-                    print(Fore.YELLOW, f"Skipping {next_url} because 'mailto' links arent allowed")
-                    print(Style.RESET_ALL)
-                continue
-            if next_url.startswith("javascript:"):
-                if verbose:
-                    print(Fore.YELLOW, f"Skipping {next_url} because 'javascript' links arent allowed")
-                    print(Style.RESET_ALL)
+            if FilterLinks(next_url, verbose, True):
                 continue
 
             if next_url not in Found_Links and "google.com" not in next_url:
                 Found_Links.add(next_url)
                 FoundLinkCount += 1
-
+
             if verbose:
                 print(Fore.YELLOW, f"{FoundLinkCount} Links has been added to the List. | {len(Found_Links)} Links in the List")
                 print(Style.RESET_ALL)
@@ -123,7 +130,10 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
                 Found_Links.add(next_url)
                 FoundLinkCount += 1
                 FoundAnyLinks = True
-
+
+                if extract_domain(next_url) not in whitelist:
+                    ScanGoogleLink(url=next_url, title=link.text.strip(), verbose=verbose, DebugInformation=False)
+
         if CookieBannerClicked is False:
             try:
                 cookie_banner = driver.find_element(By.XPATH, "//*[@id='CXQnmb']")
@@ -136,8 +146,7 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
            pass
 
        try:
-            print(Fore.GREEN, f"Searching [Links Found: {len(Found_Links)}]")
-            print(Style.RESET_ALL)
+            print(f"{Fore.GREEN}Searching [Links Found: {len(Found_Links)}] {Fore.WHITE}| {Fore.RED}Infringing Links Found: {len(infriding_data)}{Style.RESET_ALL}")
 
            if verbose:
                print(Fore.YELLOW, f"{FoundLinkCount} Links has been added to the List. | {len(Found_Links)} Links in the List")
@@ -168,22 +177,22 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
                 driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
             pass
 
-    print(f"Query: {Query}\nFound Links: {len(Found_Links)}")
+    print(f"Query: {Query}\nFound Links: {len(Found_Links)}\n{Fore.RED}Infringing Search Results: {len(infriding_data)}{Style.RESET_ALL}")
 
     if ReportFile:
-        SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
-        exit()
+        SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
+        exit()
 
    except requests.exceptions.TooManyRedirects:
        print(Fore.RED, "Overloaded.")
        print(Style.RESET_ALL)
        if ReportFile:
-            SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
-            exit()
+            SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
+            exit()
    except KeyboardInterrupt:
        print("Exiting Scrape Mode.")
        if ReportFile:
-            SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
-            exit()
\ No newline at end of file
+            SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
+            exit()
\ No newline at end of file
diff --git a/src/IO.py b/src/IO.py
index 1b68dfe..dad8c5f 100644
--- a/src/IO.py
+++ b/src/IO.py
@@ -40,6 +40,13 @@ def LoadIgnoreFileExts() -> list[str]:
     exts = data["ignore_exts"]
     return exts
 
+def LoadWhitelist() -> list[str]:
+    with open('filters.json', 'r') as file:
+        data = json.load(file)
+
+    wl = data["whitelist"]
+    return wl
+
 def extract_domain(url):
     parsed_url = urlparse(url)
     return parsed_url.netloc
diff --git a/src/Scrape.py b/src/Scrape.py
index a3a611c..62d24bf 100644
--- a/src/Scrape.py
+++ b/src/Scrape.py
@@ -46,6 +46,9 @@
 chrome_options.add_argument('--log-level=3')
 chrome_options.add_argument('--disable-logging')
 chrome_options.add_argument('--disable-dev-shm-usage')
+chrome_options.add_argument("--disable-gpu")
+chrome_options.add_argument("--disable-extensions")
+chrome_options.add_argument("--remote-debugging-pipe")
 
 driver = webdriver.Chrome(options=chrome_options)
diff --git a/src/filters.json b/src/filters.json
index 9dd2a31..ebeee3d 100644
--- a/src/filters.json
+++ b/src/filters.json
@@ -38,5 +38,8 @@
         ".editorconfig",
         ".npmrc",
         ".dockerignore"
+    ],
+    "whitelist": [
+        "https://www.youtube.com/"
     ]
 }
\ No newline at end of file
diff --git a/src/hashes.json b/src/hashes.json
index 81d7edd..52bdb65 100644
--- a/src/hashes.json
+++ b/src/hashes.json
@@ -14,5 +14,13 @@
             "hash": "81ecf8a6c049c784bd48dc40bcbd3840c7a95d31bd4a82ed40db9610cb639de2",
             "description": "Mediakit"
         }
+    ],
+    "entertainment": [
+        {
+            "title": "Test123",
+            "original_url": "https://imdb.com/zuez9zw7ez79z79q",
+            "copyright_owner": "Example Inc",
+            "hash": "c8392dc67d913d39664e0fc400280a2de03107348f7432e226194f0a7e43f2be"
+        }
     ]
 }
\ No newline at end of file
diff --git a/src/patterns.json b/src/patterns.json
new file mode 100644
index 0000000..eca34fd
--- /dev/null
+++ b/src/patterns.json
@@ -0,0 +1,27 @@
+{
+    "patterns": {
+        "anime_pattern": "(.*)(Episode\\s(0|[1-9]\\d{0,2}|1\\d{3}|2000)\\s(?:English|Deutsch)\\sSub)(.*)",
+        "turkish_pattern": "(.*)(\\d+\\.\\sSezon\\s\\d+\\.\\sBölüm)(.*)",
+        "pirated_patterns": [
+            "(.*)(Episode\\s\\d+\\s(?:English|Deutsch))(.*)",
+            "(.*)(?:Hindi\\sDubbed|Dubbed|VietSub|Deutsche\\sUntertiteln|DUB|SUB)(.*)",
+            "(.*)(Multi\\sAudio)(.*)",
+            "(.*)(VOSTFR)(.*)",
+            "(.*)(Stream)(?!ing)(.*)",
+            "(.*)(Watch\\sNow)(.*)",
+            "(.*)(For\\sFree)(.*)",
+            "\\b(?:Anime\\s*)?\\bHigh\\s*Quality\\b.*\\bFree\\b",
+            "\\b(?:Lordfilm|Лордфильм|Lordserial|Aniwave|9Anime|Lordseria|hianime|zorox|4anime|kissanime|anix|aniworld|9anime|gogoanime)\\b",
+            "\\bRead .{4,} Online Free\\b",
+            "(.*)\\s\\(Official Simulpub\\)",
+            "\\bWatch .{3,} Season .{1,} Online Free\\b"
+        ],
+        "resolution_pattern": "(.*)(?:Watch)\\s(.*)\\s(?:inHD|in\\s1080p|in\\s1440p|in\\s4K|in\\s2160p|in\\s8K)?\\s?(?:Online)?(?:For\\sFree)?",
+        "episode_pattern": "(.*)(Episode\\s\\d+)(.*)",
+        "staffel_pattern": "(.*)(Staffel\\s\\d+)(.*)",
+        "legal_sites_pattern": "(Crunchyroll|Funimation|Netflix|Hulu|Amazon\\sPrime|Disney\\s\\+|Youtube|Twitter|Facebook|Imdb|Yahoo)",
+        "anime_deep_pattern": "^(?:^|.*\\b)(Stream|Watch)\\s+online(?:\\s+free)?(?:\\b.*)?$",
+        "manga_chapter_pattern": "(.*),\\sChapter\\s\\d+",
+        "manga_online_pattern": "(.*)(Manga\\s.*)\\sOnline\\s-\\s\\[ENGLISH\\]"
+    }
+}
\ No newline at end of file
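
For reference, a minimal sketch (not part of the patch above) of how a search-result title could be checked against the new `patterns.json` rules, in the spirit of the `ScanTitle()` helper added in `src/ContentMatching.py`. The helper name `title_looks_pirated`, the sample titles, and the simplified check order are assumptions for illustration only; the real function also requires the monitored show name to appear in the title and layers in several more pattern groups.

```python
# Illustrative sketch only — not part of the diff. Assumes it runs from src/,
# next to the new patterns.json, and simplifies ScanTitle()'s ordering of checks.
import json
import re

def title_looks_pirated(title: str) -> bool:
    # Load the regex strings that the patch stores in patterns.json.
    with open("patterns.json", "r", encoding="utf-8") as file:
        patterns = json.load(file)["patterns"]

    legal_sites = re.compile(patterns["legal_sites_pattern"], re.IGNORECASE)
    episode = re.compile(patterns["episode_pattern"], re.IGNORECASE)
    pirated = [re.compile(p, re.IGNORECASE) for p in patterns["pirated_patterns"]]

    if legal_sites.search(title):   # official platforms are treated as legal
        return False
    if episode.search(title):       # "... Episode 12 ..." style titles
        return True
    return any(p.search(title) for p in pirated)

# Hypothetical titles for illustration:
print(title_looks_pirated("Watch EntertainRewind 2024 Episode 3 English Sub For Free"))  # True
print(title_looks_pirated("EntertainRewind 2024 - Official Trailer - Netflix"))          # False
```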