CopyrightArmor release v1
Copy05 committed Apr 29, 2024
1 parent 359d7d7 commit c52a675
Showing 11 changed files with 202 additions and 33 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -9,6 +9,7 @@
- **Web Scraping Engine**: CopyrightArmor uses a robust web scraping engine to crawl and analyze websites for potentially infringing content.
- **Recursive Scraping**: Recursively scans websites.
- **Google Optimized**: when `--google` and `--google-search` are used without `-url`, the web scraping engine is optimized for Google Search.
- **Google SERP Engine**: CopyrightArmor detects most pirate sites in Google Search results. ([always check for false positives](https://github.com/Copy05/CopyrightArmor/discussions/4))
- **Flexible**: You can configure how and what type of links it should scrape:
  - Exclude Social Media Links
  - Exclude Query parameter links
@@ -48,6 +49,8 @@ pip install -r requirements.txt
```

4. Open `src/hashes.json` and add every piece of content you want to scan for, using this syntax:

**`--google-search`** scans only for the content listed under `"entertainment"`.
```json
{
    "images": [
@@ -65,6 +68,14 @@ pip install -r requirements.txt
"hash": "3b99f49776f433aeb000fa010e452879198fe4be7f6660552527b53304268343",
"description": "Another Pink Cherry Tree Exe"
}
],
"entertainment": [
{
"title": "EntertainRewind 2024",
"original_url": "example.com",
"copyright_owner": "Entertainmasters",
"hash": "c8392dc67d913d39664e0fc400280a2de03107348f7432e226194f0a7e4eeebe"
}
]
}
```
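
For example, a Google-optimized scan for the `"entertainment"` entry above could be started like this (illustrative only — the query string is made up, and the exact flag spellings should be confirmed with `--help`):

```
python src/CopyrightArmor.py --google --google-search "EntertainRewind 2024 watch online" --verbose
```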
@@ -87,4 +98,4 @@ I do accept contributions to this project. create a pull request and describe yo
| Manga | +195,188,170 |
| Models | +167,086,838 |
| XXX | +11,971,422 |
| WGCZ (BangBros) | +11,355,801 |
| WGCZ (BangBros) | +11,355,801 |
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,4 +2,4 @@ beautifulsoup4==4.12.2
selenium==4.14.0
webdriver-manager==4.0.1
requests==2.31.0
colorama==0.4.6
colorama==0.4.6
2 changes: 1 addition & 1 deletion setup.py
@@ -22,7 +22,7 @@

setup(
    name="CopyrightArmor",
    version="0.2",
    version="1.0",
    author="Copy05",
    description="A tool that scans the web for pirated content",
    url="https://github.com/Copy05/CopyrightArmor/",
109 changes: 108 additions & 1 deletion src/ContentMatching.py
@@ -21,12 +21,119 @@
import requests
import hashlib
import json
import re

from bs4 import BeautifulSoup
from colorama import Style, Fore
from urllib.parse import urljoin
from IO import extract_domain, LoadIgnoreFileExts

def ScanTitle(title, my_content):
    with open('patterns.json', 'r') as file:
        patterns = json.load(file)['patterns']

    anime_pattern = re.compile(patterns['anime_pattern'], re.IGNORECASE)
    turkish_pattern = re.compile(patterns['turkish_pattern'], re.IGNORECASE) # Some may include Turkish Language.
    pirated_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in patterns['pirated_patterns']]
    resolution_pattern = re.compile(patterns['resolution_pattern'], re.IGNORECASE)
    episode_pattern = re.compile(patterns['episode_pattern'], re.IGNORECASE)
    staffel_pattern = re.compile(patterns['staffel_pattern'], re.IGNORECASE)
    legal_sites_pattern = re.compile(patterns['legal_sites_pattern'], re.IGNORECASE)
    anime_deep_pattern = re.compile(patterns['anime_deep_pattern'], re.IGNORECASE)
    manga_chapter_pattern = re.compile(patterns['manga_chapter_pattern'], re.IGNORECASE)
    manga_online_pattern = re.compile(patterns['manga_online_pattern'], re.IGNORECASE)

    if my_content not in title:
        return False

    # Check for the presence of "Watch Anime" in the title
    if "Watch Anime" in title:
        return True

    if anime_pattern.match(title) or turkish_pattern.match(title):
        return True
    for pattern in pirated_patterns:
        if re.match(pattern, title):
            return True
    if episode_pattern.match(title) or staffel_pattern.match(title):
        return True
    if manga_chapter_pattern.match(title) or manga_online_pattern.match(title):
        return True
    if legal_sites_pattern.search(title):
        return False
    if anime_deep_pattern.match(title):
        return True
    if resolution_pattern.match(title):
        return True
    return False

def ScanGoogleLink(url, title, DebugInformation=False, verbose=False):
    from GoogleScrape import SearchQuery, infriding_data, UniqueFiles, infringing_urls

    titles_list = []
    contentFlagged = False

    Query = SearchQuery.replace("+", " ").replace('%2C', ',').replace('%20', ' ')

    if verbose:
        print(f"{Fore.YELLOW}URL: {url}\nTitle: {title}{Style.RESET_ALL}")

    with open("hashes.json") as file:
        data = json.load(file)

    entertainment_data = data.get("entertainment", [])

    # For every entry inside the entertainment data, append the show name to the list.
    for entry in entertainment_data:
        m_title = entry.get("title", "")
        titles_list.append(m_title)

    for content in titles_list:

        # To differentiate content, check whether the show title appears inside the search result title
        # (e.g. "Watch SHOW1" contains "SHOW1") so that the result gets flagged as "SHOW1".
        if ScanTitle(title, content) and content.lower() in title.lower():
            if verbose:
                print(f"{Fore.RED}COPYRIGHTED MATERIAL FOUND{Style.RESET_ALL}")
            contentFlagged = True
        else:
            if verbose:
                print(f"{Fore.GREEN}LEGAL{Style.RESET_ALL}")
            contentFlagged = False

        if contentFlagged and content.lower() in title.lower():
            for entry in data['entertainment']:

                if entry['title'].lower() in content.lower():

                    original_owner = entry["copyright_owner"]
                    original_source = entry["original_url"]

                    # Skip the URL if it is the original source.
                    if url == original_source:
                        continue

                    infringing_urls.add(url)
                    UniqueFiles.add(url)

                    infriding_data.append({
                        "url": url,
                        "type": "Copyrighted Show",
                        "original_url": original_source,
                        "copyright_owner": original_owner,
                        "description": entry['title'],
                    })

                    if verbose:
                        print(Fore.RED, f"\nCopyright Infringing Show has been found on {url}.\nSearch Result Title: {title}\nCopyrighted Work: {entry['title']}\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
                    else:
                        print(Fore.RED, f"\nCopyright Infringing Show has been found on {url}.\nCopyrighted Work: {entry['title']}\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
                    print(Style.RESET_ALL)

            break

def ScanImage(soup : BeautifulSoup, url, DebugInformation : bool):
    from Scrape import ScannedImages, infriding_data, infringing_urls, TheBaseURL, UniqueFiles

@@ -151,4 +258,4 @@ def ScanFiles(soup: BeautifulSoup, url, DebugInformation: bool):
print(Fore.RED, f"\nCopyright Infringing File (\"{link_hash}\") has been found on {url}.\nCopyright Owner: {original_owner}\nOriginal Source: {original_source}\n")
print(Style.RESET_ALL)

break
break
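
Note: the new `ScanTitle` helper loads its regular expressions from a `patterns.json` file that is not shown in this diff. A minimal sketch of the layout it appears to expect, with placeholder expressions rather than the project's real patterns:

```json
{
    "patterns": {
        "anime_pattern": "watch .* anime",
        "turkish_pattern": "izle",
        "pirated_patterns": ["free download", "watch .* online free"],
        "resolution_pattern": "(480p|720p|1080p)",
        "episode_pattern": "episode [0-9]+",
        "staffel_pattern": "staffel [0-9]+",
        "legal_sites_pattern": "(netflix|crunchyroll|amazon)",
        "anime_deep_pattern": "(sub|dub)",
        "manga_chapter_pattern": "chapter [0-9]+",
        "manga_online_pattern": "read .* manga online"
    }
}
```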
6 changes: 0 additions & 6 deletions src/CopyrightArmor.py
@@ -88,14 +88,8 @@ def PrintVersion():
# To Avoid Long Execution Time when not using the scraping engine.
from GoogleScrape import GoogleScrape

if args.detailed_report and args.report_file is False:
print(Fore.RED, "Error: Invalid Argument: \"--detailed-report\" because \"--report_file\" is false")
print(Style.RESET_ALL)
exit(1)

GoogleScrape(Query=args.google_search, RateLimmit=args.rate_limit, verbose=args.verbose, ReportFile=args.report_file)


if not any(vars(args).values()):
print(Fore.RED, "Error: No arguments provided. Use -h or --help for usage information.")
print(Style.RESET_ALL)
55 changes: 32 additions & 23 deletions src/GoogleScrape.py
@@ -35,34 +35,49 @@
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from IO import SaveReport
from IO import SaveReport, LoadWhitelist, extract_domain
from verbose_print import PrintFoundLinks
from ContentMatching import ScanGoogleLink
from ScrapingEngine import FilterLinks
from utils import GetSettings

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--log-level=3')
chrome_options.add_argument('--disable-logging')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--remote-debugging-pipe")

driver = webdriver.Chrome(options=chrome_options)

Found_Links = set()
ScannedImages = set()
UniqueFiles = set()
infringing_urls = set()
infriding_data = []
Index = 1
CookieBannerClicked = False
SearchQuery = None

MORE_RESULTS_BUTTON_XPATHS = ["//*[@id='botstuff']/div/div[3]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[2]/h3/span"]
MORE_RESULTS_BUTTON_XPATHS = ["//*[@id='botstuff']/div/div[3]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-overview']/div/div[2]/div/div/div[4]/a[2]/h3/span", "//*[@id='kp-wp-tab-cont-overview']/div/div[3]/div/div/div[4]/a[1]/h3/div", "//*[@id='botstuff']/div/div[4]/div[4]/a[1]/h3/div", "//*[@id='kp-wp-tab-cont-TvmWatch']/div/div[3]/div/div/div[4]/a[1]/h3/div"]

def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateLimmitTime=2):

    global CookieBannerClicked
    global SearchQuery

    URL = f"https://google.com/search?q={Query}&cs=0&filter=0&safe=off&nfpr=1"

    warnings.filterwarnings("ignore", category=InsecureRequestWarning)

    soup = None
    MAXIMAL_RETRIES = 5
    MAXIMAL_RETRIES = 3
    Retries = 0
    SearchQuery = Query
    whitelist = LoadWhitelist()

    if RateLimmit:
        time.sleep(RateLimmitTime)
@@ -91,21 +106,13 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
for link in soup.find_all('a', href=True):
next_url = urljoin(URL, link['href'])

if next_url.startswith("mailto:"):
if verbose:
print(Fore.YELLOW, f"Skipping {next_url} because 'mailto' links arent allowed")
print(Style.RESET_ALL)
continue
if next_url.startswith("javascript:"):
if verbose:
print(Fore.YELLOW, f"Skipping {next_url} because 'javascript' links arent allowed")
print(Style.RESET_ALL)
if FilterLinks(next_url, verbose, True):
continue

if next_url not in Found_Links and "google.com" not in next_url:
Found_Links.add(next_url)
FoundLinkCount += 1

if verbose:
print(Fore.YELLOW, f"{FoundLinkCount} Links has been added to the List. | {len(Found_Links)} Links in the List")
print(Style.RESET_ALL)
@@ -123,7 +130,10 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
Found_Links.add(next_url)
FoundLinkCount += 1
FoundAnyLinks = True


if extract_domain(next_url) not in whitelist:
ScanGoogleLink(url=next_url, title=link.text.strip(), verbose=verbose, DebugInformation=False)

if CookieBannerClicked is False:
try:
cookie_banner = driver.find_element(By.XPATH, "//*[@id='CXQnmb']")
@@ -136,8 +146,7 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
pass

try:
print(Fore.GREEN, f"Searching [Links Found: {len(Found_Links)}]")
print(Style.RESET_ALL)
print(f"{Fore.GREEN}Searching [Links Found: {len(Found_Links)}] {Fore.WHITE}| {Fore.RED}Infriding Links Found: {len(infriding_data)}{Style.RESET_ALL}")

if verbose:
print(Fore.YELLOW, f"{FoundLinkCount} Links has been added to the List. | {len(Found_Links)} Links in the List")
@@ -168,22 +177,22 @@ def GoogleScrape(Query, verbose=False, ReportFile=False, RateLimmit=False, RateL
driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
pass

print(f"Query: {Query}\nFound Links: {len(Found_Links)}")
print(f"Query: {Query}\nFound Links: {len(Found_Links)}\n{Fore.RED}Infriding Search Results: {len(infriding_data)}{Style.RESET_ALL}")
if ReportFile:
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
exit()
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
exit()

except requests.exceptions.TooManyRedirects:
print(Fore.RED, "Overloaded.")
print(Style.RESET_ALL)

if ReportFile:
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
exit()
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
exit()

except KeyboardInterrupt:
print("Exiting Scrape Mode.")

if ReportFile:
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, detailed=False, found_links=Found_Links)
exit()
SaveReport(URL=f"Google_Search_{Query}", content=Found_Links, settings_string=GetSettings(RateLimmit, False, False, False, False), infriding_data=infriding_data, infriding_urls=infringing_urls, scanned_images=ScannedImages)
exit()
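
Note: the inline `mailto:`/`javascript:` checks removed above are replaced by a call to `FilterLinks` from `ScrapingEngine`, which is not shown in this diff. A minimal sketch of what such a filter could look like, purely as an assumption about its contract (a truthy return value means "skip this link"); the third parameter name is hypothetical:

```python
from colorama import Fore, Style

def FilterLinks(next_url: str, verbose: bool = False, google_mode: bool = False) -> bool:
    """Hypothetical stand-in for ScrapingEngine.FilterLinks: return True to skip next_url."""
    for scheme in ("mailto:", "javascript:"):
        if next_url.startswith(scheme):
            if verbose:
                print(Fore.YELLOW, f"Skipping {next_url} because '{scheme[:-1]}' links aren't allowed")
                print(Style.RESET_ALL)
            return True
    return False
```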
7 changes: 7 additions & 0 deletions src/IO.py
@@ -40,6 +40,13 @@ def LoadIgnoreFileExts() -> list[str]:
    exts = data["ignore_exts"]
    return exts

def LoadWhitelist() -> list[str]:
    with open('filters.json', 'r') as file:
        data = json.load(file)

    wl = data["whitelist"]
    return wl

def extract_domain(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc
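
As used in `GoogleScrape.py`, the whitelist is compared against `extract_domain(next_url)`, which returns only the host portion of a URL. A small illustrative check (note that whitelist entries have to be stored in whatever form the comparison expects — `urlparse` yields `www.youtube.com`, not the full URL stored in `filters.json`):

```python
from urllib.parse import urlparse

def extract_domain(url):
    # Same helper as in IO.py: return only the network location of the URL.
    parsed_url = urlparse(url)
    return parsed_url.netloc

whitelist = ["https://www.youtube.com/"]  # entry format as stored in filters.json
domain = extract_domain("https://www.youtube.com/watch?v=abc123")
print(domain)                   # "www.youtube.com"
print(domain not in whitelist)  # True with this entry format, so the link would still be scanned
```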
3 changes: 3 additions & 0 deletions src/Scrape.py
@@ -46,6 +46,9 @@
chrome_options.add_argument('--log-level=3')
chrome_options.add_argument('--disable-logging')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--remote-debugging-pipe")

driver = webdriver.Chrome(options=chrome_options)

3 changes: 3 additions & 0 deletions src/filters.json
@@ -38,5 +38,8 @@
".editorconfig",
".npmrc",
".dockerignore"
],
"whitelist": [
"https://www.youtube.com/"
]
}
8 changes: 8 additions & 0 deletions src/hashes.json
@@ -14,5 +14,13 @@
"hash": "81ecf8a6c049c784bd48dc40bcbd3840c7a95d31bd4a82ed40db9610cb639de2",
"description": "Mediakit"
}
],
"entertainment": [
{
"title": "Test123",
"original_url": "https://imdb.com/zuez9zw7ez79z79q",
"copyright_owner": "Example Inc",
"hash": "c8392dc67d913d39664e0fc400280a2de03107348f7432e226194f0a7e43f2be"
}
]
}
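
The 64-character hex digests in `hashes.json` look like SHA-256 (and `ContentMatching.py` imports `hashlib`). Assuming that, a small helper for producing the `hash` value of a local file might look like this — the file name below is hypothetical:

```python
import hashlib

def sha256_of_file(path: str) -> str:
    """Return the hex SHA-256 digest of a file, e.g. for a hashes.json entry."""
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        # Read in chunks so large files do not have to fit in memory.
        for chunk in iter(lambda: handle.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Example usage (hypothetical file name):
# print(sha256_of_file("PinkCherryTree.exe"))
```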