From d1f7a29500c4b780bed8577f4042df83e1d76f10 Mon Sep 17 00:00:00 2001 From: Hasan Huseyin Semiz Date: Mon, 14 Jun 2021 00:06:47 +0200 Subject: [PATCH] update --- ...aping_Selenium_GoogleMaps-checkpoint.ipynb | 367 ++++++++++++++++++ 1 file changed, 367 insertions(+) create mode 100644 GoogleMaps_Scraping/.ipynb_checkpoints/WebScraping_Selenium_GoogleMaps-checkpoint.ipynb diff --git a/GoogleMaps_Scraping/.ipynb_checkpoints/WebScraping_Selenium_GoogleMaps-checkpoint.ipynb b/GoogleMaps_Scraping/.ipynb_checkpoints/WebScraping_Selenium_GoogleMaps-checkpoint.ipynb new file mode 100644 index 0000000..d75ca9f --- /dev/null +++ b/GoogleMaps_Scraping/.ipynb_checkpoints/WebScraping_Selenium_GoogleMaps-checkpoint.ipynb @@ -0,0 +1,367 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from selenium import webdriver\n", + "from selenium.webdriver.chrome.options import Options\n", + "import time\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "klinikName_list_last = []\n", + "text_list_last = []\n", + "date_list_last = []\n", + "star_list_last = []\n", + "like_list_last = []" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def scraping(url):\n", + " DRIVER_PATH = '../GoogleMaps_Scraping/chromedriver'\n", + " options = webdriver.ChromeOptions()\n", + " \n", + " #chromedriver language change to German\n", + " options.add_experimental_option('prefs', {'intl.accept_languages': 'de'})\n", + " driver = webdriver.Chrome(executable_path=DRIVER_PATH,options=options)\n", + " \n", + " driver.get(url)\n", + " driver.maximize_window()\n", + " time.sleep(3)\n", + " \n", + " #Cookies\n", + " driver.find_element_by_xpath('/html/body/c-wiz/div/div/div/div[2]/div[1]/div[4]/form/div[1]/div/button/span').click()\n", + " time.sleep(4)\n", + "\n", + " klinikName = driver.find_element_by_id(\"searchboxinput\").get_attribute(\"value\")\n", + "\n", + " #bewertungen clicken\n", + " driver.find_element_by_css_selector('.widget-pane-link').click()\n", + " time.sleep(3)\n", + "\n", + " #scroll\n", + " jscommand = \"\"\"\n", + " berichte = document.querySelector(\".section-layout.section-scrollbox.cYB2Ge-oHo7ed.cYB2Ge-ti6hGc\");\n", + " berichte.scrollTo(0, berichte.scrollHeight);var lenOfPage=berichte.scrollHeight;return lenOfPage;\n", + " \"\"\"\n", + " lenOfPage = driver.execute_script(jscommand)\n", + " match=False\n", + " while(match==False):\n", + " lastCount = lenOfPage\n", + " time.sleep(1)\n", + " lenOfPage = driver.execute_script(jscommand)\n", + " if lastCount == lenOfPage:\n", + " match=True\n", + " time.sleep(1)\n", + " \n", + " #mehr button click\n", + " loads = driver.find_elements_by_css_selector('.ODSEW-KoToPc-ShBeI.gXqMYb-hSRGPd')\n", + " for load in loads:\n", + " load.click()\n", + " time.sleep(1)\n", + " \n", + " texts = driver.find_elements_by_xpath('//div[@class=\"ODSEW-ShBeI-ShBeI-content\"]/span[2]')\n", + " dates = driver.find_elements_by_xpath('//span[@class=\"ODSEW-ShBeI-RgZmSc-date\"]')\n", + " stars = driver.find_elements_by_xpath('//div[@class=\"ODSEW-ShBeI-jfdpUb\"]/span[2]')\n", + " likes = driver.find_elements_by_xpath('//button[@class=\"ODSEW-ShBeI-Sc2xXc-LgbsSe\"]/span/span[2]')\n", + "\n", + " klinikName_list = [klinikName for a in range(len(texts))]\n", + " text_list = [a.text for a in texts]\n", + " date_list = [a.text for a in dates]\n", + " star_list = [a.get_attribute(\"aria-label\")[1] for a in stars]\n", + "\n", + " likes_list =[]\n", + " for t in likes:\n", + " if t.text =='':\n", + " likes_list.append('Keine')\n", + " else:\n", + " likes_list.append(t.text)\n", + "\n", + " like_list=[] \n", + " for t in range(len(texts)):\n", + " if t>=len(likes_list):\n", + " like_list.append('Keine')\n", + " else:\n", + " like_list.append(likes_list[t])\n", + " \n", + " for i in klinikName_list:\n", + " klinikName_list_last.append(i)\n", + " for i in text_list:\n", + " text_list_last.append(i)\n", + " for i in date_list:\n", + " date_list_last.append(i)\n", + " for i in star_list:\n", + " star_list_last.append(i)\n", + " for i in like_list:\n", + " like_list_last.append(i) \n", + " \n", + "\n", + " driver.quit()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "ename": "NoSuchWindowException", + "evalue": "Message: no such window: window was already closed\n (Session info: chrome=91.0.4472.101)\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNoSuchWindowException\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mdf_url\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_excel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'Klinikliste.xlsx'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m24\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Link Google Maps\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0murl\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf_url\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mscraping\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mscraping\u001b[0;34m(url)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;31m#Cookies\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/html/body/c-wiz/div/div/div/div[2]/div[1]/div[4]/form/div[1]/div/button/span'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclick\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in \u001b[0;36mfind_element_by_xpath\u001b[0;34m(self, xpath)\u001b[0m\n\u001b[1;32m 392\u001b[0m \u001b[0melement\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_element_by_xpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'//div/td[1]'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 393\u001b[0m \"\"\"\n\u001b[0;32m--> 394\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_element\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mby\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mBy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mXPATH\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mxpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 395\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 396\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfind_elements_by_xpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in \u001b[0;36mfind_element\u001b[0;34m(self, by, value)\u001b[0m\n\u001b[1;32m 974\u001b[0m \u001b[0mby\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mCSS_SELECTOR\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 975\u001b[0m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'[name=\"%s\"]'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 976\u001b[0;31m return self.execute(Command.FIND_ELEMENT, {\n\u001b[0m\u001b[1;32m 977\u001b[0m \u001b[0;34m'using'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mby\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 978\u001b[0m 'value': value})['value']\n", + "\u001b[0;32m/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py\u001b[0m in \u001b[0;36mexecute\u001b[0;34m(self, driver_command, params)\u001b[0m\n\u001b[1;32m 319\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommand_executor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver_command\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 321\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror_handler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcheck_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 322\u001b[0m response['value'] = self._unwrap_value(\n\u001b[1;32m 323\u001b[0m response.get('value', None))\n", + "\u001b[0;32m/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/errorhandler.py\u001b[0m in \u001b[0;36mcheck_response\u001b[0;34m(self, response)\u001b[0m\n\u001b[1;32m 240\u001b[0m \u001b[0malert_text\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'alert'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'text'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malert_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 242\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexception_class\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscreen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstacktrace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 243\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_value_or_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNoSuchWindowException\u001b[0m: Message: no such window: window was already closed\n (Session info: chrome=91.0.4472.101)\n" + ] + } + ], + "source": [ + "#Main\n", + "df_url = pd.read_excel(r'Klinikliste.xlsx')[:24][\"Link Google Maps\"]\n", + "for url in df_url:\n", + " scraping(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2308\n", + "2308\n", + "2308\n", + "2308\n", + "2308\n" + ] + } + ], + "source": [ + "print(len(klinikName_list_last))\n", + "print(len(text_list_last))\n", + "print(len(date_list_last))\n", + "print(len(star_list_last))\n", + "print(len(like_list_last))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(zip(klinikName_list_last,text_list_last,date_list_last,star_list_last,like_list_last), columns=[\"Name der Klinik\",\"Bewertung\",\"Datum der Bewertung\",\"Sternebewertung\",'Likes'])\n", + "df.to_csv('GoogleMaps.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Name der KlinikBewertungDatum der BewertungSternebewertungLikes
0Herzogin Elisabeth HospitalVor der Corona Pandemie hätte ich jeden zugest...vor 3 Wochen21
1Herzogin Elisabeth HospitalTolles Krankenhaus, trotzdem bin ich lieber g...vor einem Jahr41
2Herzogin Elisabeth HospitalLungenembolie!\\nEin Tag ITS, phantastische Für...vor 2 Monaten51
3Herzogin Elisabeth HospitalAlles sehr gut. Volle Punktzahl!vor 3 Jahren54
4Herzogin Elisabeth HospitalIch werde nie wieder in ein anderes Krankenhau...vor 2 Monaten51
5Herzogin Elisabeth HospitalIch bin begeistert von der Leistung des Assist...vor einem Monat5Keine
6Herzogin Elisabeth HospitalEine 100%tige Empfehlung.\\nTolles Personal, al...vor einem Monat51
7Herzogin Elisabeth HospitalAm 20.02.2021 mit akutem Herzinfarkt puppenlus...vor einem Monat42
8Herzogin Elisabeth HospitalVor 6 Wochen Hüft Tep links bekommen. Fühlte m...vor 2 Monaten51
9Herzogin Elisabeth HospitalOP 07/20. Mir wurde von Dr. Sherstyuk ein 2tes...vor 2 Monaten51
\n", + "
" + ], + "text/plain": [ + " Name der Klinik \\\n", + "0 Herzogin Elisabeth Hospital \n", + "1 Herzogin Elisabeth Hospital \n", + "2 Herzogin Elisabeth Hospital \n", + "3 Herzogin Elisabeth Hospital \n", + "4 Herzogin Elisabeth Hospital \n", + "5 Herzogin Elisabeth Hospital \n", + "6 Herzogin Elisabeth Hospital \n", + "7 Herzogin Elisabeth Hospital \n", + "8 Herzogin Elisabeth Hospital \n", + "9 Herzogin Elisabeth Hospital \n", + "\n", + " Bewertung Datum der Bewertung \\\n", + "0 Vor der Corona Pandemie hätte ich jeden zugest... vor 3 Wochen \n", + "1 Tolles Krankenhaus, trotzdem bin ich lieber g... vor einem Jahr \n", + "2 Lungenembolie!\\nEin Tag ITS, phantastische Für... vor 2 Monaten \n", + "3 Alles sehr gut. Volle Punktzahl! vor 3 Jahren \n", + "4 Ich werde nie wieder in ein anderes Krankenhau... vor 2 Monaten \n", + "5 Ich bin begeistert von der Leistung des Assist... vor einem Monat \n", + "6 Eine 100%tige Empfehlung.\\nTolles Personal, al... vor einem Monat \n", + "7 Am 20.02.2021 mit akutem Herzinfarkt puppenlus... vor einem Monat \n", + "8 Vor 6 Wochen Hüft Tep links bekommen. Fühlte m... vor 2 Monaten \n", + "9 OP 07/20. Mir wurde von Dr. Sherstyuk ein 2tes... vor 2 Monaten \n", + "\n", + " Sternebewertung Likes \n", + "0 2 1 \n", + "1 4 1 \n", + "2 5 1 \n", + "3 5 4 \n", + "4 5 1 \n", + "5 5 Keine \n", + "6 5 1 \n", + "7 4 2 \n", + "8 5 1 \n", + "9 5 1 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}