From 8746304685bb2549461193e56c2138b98137c146 Mon Sep 17 00:00:00 2001 From: Hajime Taniguchi Date: Sun, 10 Jan 2021 02:13:33 +0900 Subject: [PATCH 1/4] =?UTF-8?q?innerHTML=E3=82=92text=E3=81=AB=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- task02/kadai1and2.py | 4 ++-- task02/kadai5.py | 6 +++--- task02/kadai6.py | 6 +++--- task02/kadai7.py | 7 +++---- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/task02/kadai1and2.py b/task02/kadai1and2.py index b41e258..9acc196 100644 --- a/task02/kadai1and2.py +++ b/task02/kadai1and2.py @@ -64,8 +64,8 @@ def main(): content_list = info.find_element_by_class_name("cassetteRecruit__main") tr_list = content_list.find_elements_by_tag_name("tr") for tr in tr_list: - item = tr.find_element_by_tag_name("th").get_attribute("innerHTML") - value = tr.find_element_by_tag_name("td").get_attribute("innerHTML") + item = tr.find_element_by_tag_name("th").text + value = tr.find_element_by_tag_name("td").text print("item:", item, "value:", value) diff --git a/task02/kadai5.py b/task02/kadai5.py index 98a034a..8bb4a6a 100644 --- a/task02/kadai5.py +++ b/task02/kadai5.py @@ -62,15 +62,15 @@ def main(): colum_name = [] for info in job_info: job_datum = [] - name = info.find_element_by_class_name("cassetteRecruit__name").get_attribute("innerHTML") + name = info.find_element_by_class_name("cassetteRecruit__name").text content_list = info.find_element_by_class_name("cassetteRecruit__main") tr_list = content_list.find_elements_by_tag_name("tr") job_datum.append(name) colum_name.clear for tr in tr_list: - item = tr.find_element_by_tag_name("th").get_attribute("innerHTML") - value = tr.find_element_by_tag_name("td").get_attribute("innerHTML") + item = tr.find_element_by_tag_name("th").text + value = tr.find_element_by_tag_name("td").text print("item:", item, "value:", value) job_datum.append(value) diff --git a/task02/kadai6.py b/task02/kadai6.py index 
aff1a9e..03e39bb 100644 --- a/task02/kadai6.py +++ b/task02/kadai6.py @@ -67,7 +67,7 @@ def main(): element = "" try: element = "会社名" - name = info.find_element_by_class_name("cassetteRecruit__name").get_attribute("innerHTML") + name = info.find_element_by_class_name("cassetteRecruit__name").text content_list = info.find_element_by_class_name("cassetteRecruit__main") element = "仕事条件" tr_list = content_list.find_elements_by_tag_name("tr") @@ -76,9 +76,9 @@ def main(): colum_name.clear for tr in tr_list: element = "項目名" - item = tr.find_element_by_tag_name("th").get_attribute("innerHTML") + item = tr.find_element_by_tag_name("th").text element = "内容" - value = tr.find_element_by_tag_name("td").get_attribute("innerHTML") + value = tr.find_element_by_tag_name("td").text print("item:", item, "value:", value) job_datum.append(value) colum_name.append(item) ## <-何度も回って少し無駄な気しますが、わざわざそれだけ取り出すコード書くのも面倒なので、、、 diff --git a/task02/kadai7.py b/task02/kadai7.py index 458f9ff..b5227d0 100644 --- a/task02/kadai7.py +++ b/task02/kadai7.py @@ -86,7 +86,7 @@ def main(): element = "" try: element = "会社名" - name = info.find_element_by_class_name("cassetteRecruit__name").get_attribute("innerHTML") + name = info.find_element_by_class_name("cassetteRecruit__name").text content_list = info.find_element_by_class_name("cassetteRecruit__main") element = "仕事条件" tr_list = content_list.find_elements_by_tag_name("tr") @@ -95,9 +95,8 @@ def main(): colum_name.clear for tr in tr_list: element = "項目名" - item = tr.find_element_by_tag_name("th").get_attribute("innerHTML") - element = "内容" - value = tr.find_element_by_tag_name("td").get_attribute("innerHTML") + item = tr.find_element_by_tag_name("th").text + value = tr.find_element_by_tag_name("td").text print("item:", item, "value:", value) job_datum.append(value) colum_name.append(item) ## <-何度も回って少し無駄な気しますが、わざわざそれだけ取り出すコード書くのも面倒なので、、、 From 9323123ceaa9e341740966cf4dc4278dbc934574 Mon Sep 17 00:00:00 2001 From: Hajime Taniguchi Date: Sun, 10 Jan 2021 
02:17:34 +0900 Subject: [PATCH 2/4] =?UTF-8?q?=5F=5F=E3=82=A2=E3=83=B3=E3=83=80=E3=83=BC?= =?UTF-8?q?=E3=82=B9=E3=82=B3=E3=82=A2=E3=81=A7=E5=A7=8B=E3=81=BE=E3=82=8B?= =?UTF-8?q?def=E3=82=92=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- task02/kadai3.py | 16 ++++++++-------- task02/kadai7.py | 20 ++++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/task02/kadai3.py b/task02/kadai3.py index 66b2167..d515cee 100644 --- a/task02/kadai3.py +++ b/task02/kadai3.py @@ -4,7 +4,7 @@ import pandas as pd # Chromeを起動する関数 -def __set_driver(driver_path, headless_flg): +def set_driver(driver_path, headless_flg): # Chromeドライバーの読み込み options = ChromeOptions() @@ -23,7 +23,7 @@ def __set_driver(driver_path, headless_flg): # ChromeのWebDriverオブジェクトを作成する。 return Chrome(executable_path=os.getcwd() + "/" + driver_path, options=options) -def __getName(driver): +def getName(driver): # ページ終了まで繰り返し取得 exp_name_list = [] # 検索結果の一番上の会社名を取得 @@ -35,7 +35,7 @@ def __getName(driver): exp_name_list.append(name.text) print(name.text) -def __checkNextPage(driver): +def checkNextPage(driver): next_link = driver.find_elements_by_class_name("pager__next") if len(next_link) > 0: print("going to load next page") @@ -49,9 +49,9 @@ def main(): search_keyword = "高収入" # driverを起動 if os.name == 'nt': #Windows - driver = __set_driver("chromedriver.exe", False) + driver = set_driver("chromedriver.exe", False) elif os.name == 'posix': #Mac - driver = __set_driver("chromedriver", False) + driver = set_driver("chromedriver", False) # Webサイトを開く driver.get("https://tenshoku.mynavi.jp/") time.sleep(5) @@ -73,11 +73,11 @@ def main(): ## 課題3 ## まず会社名を表示 - __getName(driver) + getName(driver) ## 次のページへのリンクが有る場合はクリックし、会社名を表示し続ける - while __checkNextPage(driver) == True: - __getName(driver) + while checkNextPage(driver) == True: + getName(driver) # 直接起動された場合はmain()を起動(モジュールとして呼び出された場合は起動しないようにするため) diff --git a/task02/kadai7.py 
b/task02/kadai7.py index b5227d0..9b78a93 100644 --- a/task02/kadai7.py +++ b/task02/kadai7.py @@ -31,20 +31,20 @@ def set_driver(driver_path, headless_flg): return Chrome(executable_path=os.getcwd() + "/" + driver_path, options=options) -def __get_time(): +def get_time(): now = dt.datetime.now().strftime("%Y%m%d") return now -def __take_log(event): - now = __get_time() +def take_log(event): + now = get_time() log = f"{now}, {event} {new_line_code}" with open("log.txt", mode="a", encoding="utf-8_sig") as f: f.writelines(log) -# main処理 +# main def main(): - __take_log("起動した") + take_log("起動した") search_keyword = "高収入" # driverを起動 if os.name == 'nt': #Windows @@ -54,7 +54,7 @@ def main(): # Webサイトを開く driver.get("https://tenshoku.mynavi.jp/") - __take_log("ページの読み込み成功") + take_log("ページの読み込み成功") time.sleep(5) try: @@ -71,13 +71,13 @@ def main(): driver.find_element_by_class_name("topSearch__text").send_keys(search_keyword) # 検索ボタンクリック driver.find_element_by_class_name("topSearch__button").click() - __take_log("キーワード送信成功") + take_log("キーワード送信成功") ## 課題5に tryを追加, ## 変数を用い、どの要素が問題かわかるようにしてみた。 - __take_log("スクレイピング開始") + take_log("スクレイピング開始") job_info = driver.find_elements_by_class_name("cassetteRecruit") job_data = [] colum_name = [] @@ -104,10 +104,10 @@ def main(): job_data.append(job_datum) colum_name.insert(0, "会社名") except exception as e: - __take_log("error while getting " + element) + take_log("error while getting " + element) pass else: - __take_log(f"succeeded scraping {name}") + take_log(f"succeeded scraping {name}") pass finally: print(job_data) From a314ac06abb4e43621ebb6acda5afbc43ff418e2 Mon Sep 17 00:00:00 2001 From: Hajime Taniguchi Date: Sun, 10 Jan 2021 02:21:14 +0900 Subject: [PATCH 3/4] =?UTF-8?q?element=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- task02/kadai4.py | 3 +-- task02/kadai5.py | 3 +-- task02/kadai6.py | 7 +------ task02/kadai7.py | 6 +----- 4 files changed, 4 
insertions(+), 15 deletions(-) diff --git a/task02/kadai4.py b/task02/kadai4.py index cfc4d94..e3fedf2 100644 --- a/task02/kadai4.py +++ b/task02/kadai4.py @@ -55,8 +55,7 @@ def main(): pass # 検索窓に入力 - driver.find_element_by_class_name( - "topSearch__text").send_keys(search_keyword) + driver.find_element_by_class_name("topSearch__text").send_keys(search_keyword) # 検索ボタンクリック driver.find_element_by_class_name("topSearch__button").click() diff --git a/task02/kadai5.py b/task02/kadai5.py index 8bb4a6a..9f81272 100644 --- a/task02/kadai5.py +++ b/task02/kadai5.py @@ -49,8 +49,7 @@ def main(): pass # 検索窓に入力 - driver.find_element_by_class_name( - "topSearch__text").send_keys(search_keyword) + driver.find_element_by_class_name("topSearch__text").send_keys(search_keyword) # 検索ボタンクリック driver.find_element_by_class_name("topSearch__button").click() diff --git a/task02/kadai6.py b/task02/kadai6.py index 03e39bb..8aa543d 100644 --- a/task02/kadai6.py +++ b/task02/kadai6.py @@ -64,20 +64,15 @@ def main(): colum_name = [] for info in job_info: job_datum = [] - element = "" try: - element = "会社名" name = info.find_element_by_class_name("cassetteRecruit__name").text content_list = info.find_element_by_class_name("cassetteRecruit__main") - element = "仕事条件" tr_list = content_list.find_elements_by_tag_name("tr") job_datum.append(name) colum_name.clear for tr in tr_list: - element = "項目名" item = tr.find_element_by_tag_name("th").text - element = "内容" value = tr.find_element_by_tag_name("td").text print("item:", item, "value:", value) job_datum.append(value) @@ -86,7 +81,7 @@ def main(): job_data.append(job_datum) colum_name.insert(0, "会社名") except exception as e: - print(e, "while getting ", element) + print(e) pass else: print("succeeded") diff --git a/task02/kadai7.py b/task02/kadai7.py index 9b78a93..4f82ff4 100644 --- a/task02/kadai7.py +++ b/task02/kadai7.py @@ -83,18 +83,14 @@ def main(): colum_name = [] for info in job_info: job_datum = [] - element = "" try: - element = "会社名" 
name = info.find_element_by_class_name("cassetteRecruit__name").text content_list = info.find_element_by_class_name("cassetteRecruit__main") - element = "仕事条件" tr_list = content_list.find_elements_by_tag_name("tr") job_datum.append(name) colum_name.clear for tr in tr_list: - element = "項目名" item = tr.find_element_by_tag_name("th").text value = tr.find_element_by_tag_name("td").text print("item:", item, "value:", value) @@ -104,7 +100,7 @@ def main(): job_data.append(job_datum) colum_name.insert(0, "会社名") except exception as e: - take_log("error while getting " + element) + take_log("error while getting " + e) pass else: take_log(f"succeeded scraping {name}") From 17cc5e9c0c0c79a53c9f6e5730e4668ae9a2d766 Mon Sep 17 00:00:00 2001 From: Hajime Taniguchi Date: Sun, 10 Jan 2021 02:26:49 +0900 Subject: [PATCH 4/4] =?UTF-8?q?pandas=E3=81=AEcolumns=E3=82=92=E6=96=87?= =?UTF-8?q?=E5=AD=97=E5=88=97=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- task02/kadai5.py | 2 ++ task02/kadai6.py | 2 ++ task02/kadai7.py | 3 +++ 3 files changed, 7 insertions(+) diff --git a/task02/kadai5.py b/task02/kadai5.py index 9f81272..7c1ff33 100644 --- a/task02/kadai5.py +++ b/task02/kadai5.py @@ -80,6 +80,8 @@ def main(): print(job_data) df = pd.DataFrame(job_data) + if len(df.columns) == len(colum_name): + df.columns = colum_name print(df) df.to_csv("data.csv") diff --git a/task02/kadai6.py b/task02/kadai6.py index 8aa543d..2988a23 100644 --- a/task02/kadai6.py +++ b/task02/kadai6.py @@ -91,6 +91,8 @@ def main(): df = pd.DataFrame(job_data) + if len(df.columns) == len(colum_name): + df.columns = colum_name print(df) df.to_csv("data.csv") diff --git a/task02/kadai7.py b/task02/kadai7.py index 4f82ff4..a51dc9c 100644 --- a/task02/kadai7.py +++ b/task02/kadai7.py @@ -110,6 +110,9 @@ def main(): df = pd.DataFrame(job_data) + if len(df.columns) == len(colum_name): + df.columns = colum_name + print(df) df.to_csv("data.csv")