Skip to content

Commit

Permalink
Chromedriver patch (#23)
Browse files Browse the repository at this point in the history
* Update __init__.py

* Update setup.py

* Update __init__.py

* Update __init__.py

* Update __init__.py

* Update __init__.py

* Update __init__.py

* Update __init__.py

* Update __init__.py

* Update self_update.py

* Update __init__.py

* Update __init__.py

* Update __init__.py

* Update self_update.py

* Update __init__.py

* Update __init__.py
  • Loading branch information
MathiasExorde authored Jun 22, 2023
1 parent 376be8c commit 3bdd382
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 40 deletions.
7 changes: 5 additions & 2 deletions data/exorde_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,14 @@ async def get_scraping_module(module_name):
module_hash = scraping_modules[module_name]
try:
old_module_version = metadata.version(module_hash)
logging.info(f"Scraping module [{module_name}] Version check")
logging.info(f"Current version = {old_module_version}")
online_module_version = await get_module_online_version(module_name)
if old_module_version != online_module_version:
logging.info(f"Latest version = {online_module_version}")
if old_module_version < online_module_version:
logging.info(f"Updating {module_name}")
logging.info(
"diff in versions : {module_version} != {online_module_version}"
f"diff in versions : {module_version} != {online_module_version}"
)
repository_path = f"git+https://github.com/exorde-labs/exorde-client.git#subdirectory=data/scraping/{module_name}&egg={module_hash}"

Expand Down
82 changes: 45 additions & 37 deletions data/scraping/twitter/a7df32de3a60dfdb7a0b/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,59 +353,65 @@ def get_data(card):
"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
]


def get_chrome_path(candidates=None):
    """Return the path of the first Chrome/Chromium executable that exists.

    Args:
        candidates: Optional iterable of executable paths to probe, in
            priority order. When omitted, the standard ``/usr/bin``
            locations for Chromium and Chrome are tried.

    Returns:
        The first candidate that is an existing regular file, or ``None``
        when none of them is. Callers must handle the ``None`` case.
    """
    if candidates is None:
        # Same locations, in the same priority order, as the original
        # if/elif ladder.
        candidates = (
            '/usr/bin/chromium-browser',
            '/usr/bin/chromium',
            '/usr/bin/chrome',
            '/usr/bin/google-chrome',
        )
    for candidate in candidates:
        # isfile() is False for directories and missing paths alike.
        if os.path.isfile(candidate):
            return candidate
    return None

def init_driver(headless=True, proxy=None, show_images=False, option=None, firefox=False, env="/.env"):
""" initiate a chromedriver or firefoxdriver instance
--option : other option to add (str)
"""
# NOTE(review): this listing appears to be a rendered diff whose +/- markers
# and indentation were stripped, so lines of the previous and of the new
# implementation seem to be interleaved below. Confirm statement order
# against the repository before relying on it.
# The created driver is bound to the module-level name `driver` and returned.
global driver
# Proxy address looked up through get_proxy() from the `env` path argument.
http_proxy = get_proxy(env)
# Branch on the `firefox` flag: the Firefox branch only logs that Geckodriver
# is disabled; the else branch builds Chrome options.
if firefox:
# options = FirefoxOptions()
# driver_path = geckodriver_autoinstaller.install()
logging.info("Firefox: Geckodriver disabled")
else:
options = ChromeOptions()
# The value returned by chromedriver_autoinstaller.install() is kept as driver_path.
driver_path = chromedriver_autoinstaller.install()
logging.info("Add options to Chrome Driver")
options.add_argument("--disable-blink-features") # Disable features that might betray automation
options.add_argument("--disable-blink-features=AutomationControlled") # Disables a Chrome flag that shows an 'automation' toolbar
options.add_experimental_option("excludeSwitches", ["enable-automation"]) # Disable automation flags
options.add_experimental_option('useAutomationExtension', False) # Disable automation extensions
options.add_argument("--headless") # Ensure GUI is off. Essential for Docker.
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("disable-infobars")
options.add_argument(f'user-agent={random.choice(user_agents)}')
# add proxy if available
if http_proxy is not None:
logging.info("[options] Adding a HTTP Proxy server to ChromeDriver: %s", http_proxy)
options.add_argument('--proxy-server=%s' % http_proxy)

# A Chrome driver is created here from `options` alone (no driver path given).
driver = webdriver.Chrome(options=options)
# NOTE(review): a second ChromeOptions object is built from here on; it is
# presumably the replacement for the if/else section above - confirm.
options = ChromeOptions()
# driver_path = chromedriver_autoinstaller.install()
logging.info("Adding options to Chromium Driver")
# NOTE(review): get_chrome_path() returns None when none of its candidate
# paths exists; that value is assigned to binary_location without a check.
binary_path = get_chrome_path()
options.binary_location = binary_path
logging.info(f"\tSelected Chrome executable path = {binary_path}")
options.add_argument("--no-sandbox")
options.add_argument("--disable-blink-features") # Disable features that might betray automation
options.add_argument("--disable-blink-features=AutomationControlled") # Disables a Chrome flag that shows an 'automation' toolbar
options.add_experimental_option("excludeSwitches", ["enable-automation"]) # Disable automation flags
options.add_experimental_option('useAutomationExtension', False) # Disable automation extensions
logging.info("\tDisable automation extensions & flags")
# NOTE(review): --headless is added unconditionally here, regardless of the
# `headless` parameter that is tested further down.
options.add_argument("--headless") # Ensure GUI is off. Essential for Docker.
logging.info("\tHeadless")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("disable-infobars")
# The user agent is drawn at random from the module-level user_agents list
# and kept in a local so the same value can be logged.
selected_user_agent = random.choice(user_agents)
options.add_argument(f'user-agent={selected_user_agent}')
logging.info("\tselected_user_agent : %s", selected_user_agent)

# add proxy if available
# The flag is only added when http_proxy is not None and longer than 6 characters.
if http_proxy is not None and len(http_proxy)>6:
logging.info("\tAdding a HTTP Proxy server: %s", http_proxy)
options.add_argument('--proxy-server=%s' % http_proxy)
# Apply the `headless` parameter: adds --disable-gpu and sets options.headless.
if headless is True:
logging.info("Scraping on headless mode.")
options.add_argument('--disable-gpu')
options.headless = True
else:
options.headless = False
options.add_argument('log-level=3')
# NOTE(review): the explicit `proxy` argument adds a further --proxy-server
# flag, independent of the http_proxy handling above; both can be set at once.
if proxy is not None:
options.add_argument('--proxy-server=%s' % proxy)
logging.info("using proxy : %s", proxy)
# Image preference is only set when show_images is False and firefox is False;
# presumably the value 2 blocks image loading in Chrome - confirm.
if show_images == False and firefox == False:
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)
# Caller-supplied extra argument is passed through verbatim.
if option is not None:
options.add_argument(option)

# NOTE(review): three driver constructions follow (Firefox, Chrome with the
# earlier driver_path, Chrome with a hard-coded chromedriver path); as listed
# they look like the old and new variants of the same step - confirm.
if firefox:
driver = webdriver.Firefox(options=options, executable_path=driver_path)
else:
driver = webdriver.Chrome(options=options, executable_path=driver_path)
driver_path = '/usr/local/bin/chromedriver'
logging.info(f"Opening driver from path = {driver_path}")
driver = webdriver.Chrome(options=options, executable_path=driver_path)

# NOTE(review): set_page_load_timeout is called twice in a row, first with 10
# and then with 7.
driver.set_page_load_timeout(10)
driver.set_page_load_timeout(7)
return driver


Expand Down Expand Up @@ -480,7 +486,10 @@ def log_in(env="/.env", wait=4):
target_broad = 'twitter.com/home'
try:
# Load cookies if they exist
cookies = pickle.load(open("cookies.pkl", "rb"))
try:
cookies = pickle.load(open("cookies.pkl", "rb"))
except:

logging.info("[Twitter Chrome] loading existing cookies... ")
for cookie in cookies:
logging.info("\t-%s",cookie)
Expand Down Expand Up @@ -794,7 +803,7 @@ async def query(url: str) -> AsyncGenerator[Item, None]:
# Selenium track A: login based
try:
# Usage
check_and_kill_processes(["chromedriver", "google-chrome"])
check_and_kill_processes(["chromium","chromedriver", "google-chrome"])
try:
logging.info("[Twitter] Open driver")
driver = init_driver(headless=True, show_images=False, proxy=None)
Expand All @@ -805,7 +814,6 @@ async def query(url: str) -> AsyncGenerator[Item, None]:
except Exception as e:
logging.debug("Exception during Twitter Init: %s",e)

chromedriver_autoinstaller.install()
try:
nb_tweets_wanted = 50
async for result in scrape_( keyword=search_keyword, display_type="latest", limit=nb_tweets_wanted):
Expand Down
2 changes: 1 addition & 1 deletion data/scraping/twitter/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="a7df32de3a60dfdb7a0b",
version="0.0.2",
version="0.0.3",
packages=find_packages(),
install_requires=[
"exorde_data",
Expand Down
1 change: 1 addition & 0 deletions exorde/self_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
async def self_update():
try:
latest_tag = await get_latest_tag()
logging.info(f"[CLIENT UPDATE] Updating from {local_version} to version {latest_tag}")
local_version = metadata.version("exorde")
if latest_tag != local_version:
exorde_repository_path = "git+https://github.com/exorde-labs/exorde-client.git#subdirectory=exorde&egg=exorde"
Expand Down

0 comments on commit 3bdd382

Please sign in to comment.