from selenium.webdriver.common.by import By
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from openpyxl import Workbook
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
import time
import datetime
import requests
# Chrome options setup
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
options = ChromeOptions()
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
options.add_argument('user-agent=' + user_agent)
options.add_argument("lang=ko_KR")
options.add_argument('headless')  # Run in headless mode (no browser window is opened)
# The documented format for this switch is WIDTH,HEIGHT (comma-separated).
# The previous "1920x1080" spelling is not the documented form and may be
# ignored depending on the Chrome build/headless mode.
options.add_argument('window-size=1920,1080')
options.add_argument("disable-gpu")
options.add_argument("--no-sandbox")
# Let webdriver_manager provide the chromedriver binary and hand its path to the service
service = ChromeService(executable_path=ChromeDriverManager().install())
# Launch the Chrome driver with the options configured above
driver = webdriver.Chrome(service=service, options=options)
# Target page: the visitor-review tab of one Naver Place restaurant,
# requested with reviewSort=recent.
url = 'https://m.place.naver.com/restaurant/1085956231/review/visitor?entry=ple&reviewSort=recent'
# BS4 setting for secondary access
session = requests.Session()
headers = {
    "User-Agent": "user value"}
retries = Retry(total=5,
                backoff_factor=0.1,
                status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
# The target URL above is https; without this mount the retry policy would
# only ever apply to plain-http requests made through this session.
session.mount('https://', HTTPAdapter(max_retries=retries))
# New xlsx file
now = datetime.datetime.now()  # captured once; used later to timestamp the output filename
xlsx = Workbook()
list_sheet = xlsx.create_sheet('output')
# Header row; each scraped review is appended below it in the same column order.
list_sheet.append(['nickname', 'content', 'date', 'revisit'])
# Start crawling/scraping!
# Flow: load the page, click the "더보기" (load more) link until it can no
# longer be found/clicked, parse the resulting HTML, write one row per review
# to the worksheet, and save the workbook. The workbook is saved on failure
# too, and the browser is always closed.
from selenium.common.exceptions import WebDriverException

# Build the output name once so the success and failure paths use the same file.
file_name = 'naver_review_' + now.strftime('%Y-%m-%d_%H-%M-%S') + '.xlsx'
try:
    print("Starting webdriver and accessing URL...")
    driver.get(url)
    # Element lookups below wait up to 30 s before giving up, so the final
    # (failing) lookup of the "더보기" link takes that long to end the loop.
    driver.implicitly_wait(30)
    print("Page loaded. Scrolling down...")
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)
    count = 0
    try:
        # The loop has no exit condition of its own; it ends when Selenium
        # raises because the link is missing or cannot be clicked.
        while True:
            print("Clicking on '더보기' button...")
            driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a').click()
            count += 1
            print(f"'더보기' button clicked {count} times.")
            time.sleep(0.4)
    except WebDriverException as e:
        print(f'No more "더보기" button found, finished scrolling after {count} clicks.')
        # Show why the loop really stopped; a stale XPath and a genuine
        # end-of-list otherwise look identical in the log.
        print(f"Reason: {type(e).__name__}")
    # Fixed pause before reading the DOM (presumably to let late content
    # finish rendering -- TODO confirm whether 25 s is still needed).
    time.sleep(25)
    html = driver.page_source
    bs = BeautifulSoup(html, 'lxml')
    # NOTE(review): these class names look build-generated and may go stale
    # when the site is redeployed -- verify against the live page if 0 reviews
    # are found.
    reviews = bs.select('li.YlrAu')
    print(f"Found {len(reviews)} reviews.")
    for r in reviews:
        nickname = r.select_one('div.VYGLG')
        content = r.select_one('div.vg7Fp.CyA_N')
        # select_one returns None instead of raising when nothing matches,
        # so a review without a date no longer aborts the whole run.
        date = r.select_one('div.D40bm>span.CKUdu>time')
        # The revisit text is read from the second matching span; it may be absent.
        meta_spans = r.select('div.D40bm>span.CKUdu')
        revisit = meta_spans[1] if len(meta_spans) > 1 else None
        # Fall back to an empty string for any field that was not found
        nickname = nickname.text if nickname is not None else ''
        content = content.text if content is not None else ''
        date = date.text if date is not None else ''
        revisit = revisit.text if revisit is not None else ''
        print(f"Review: {nickname} / {content} / {date} / {revisit}")
        list_sheet.append([nickname, content, date, revisit])
    # Save the file
    xlsx.save(file_name)
    print(f"File saved as {file_name}")
except Exception as e:
    # Top-level boundary: report the failure and still persist whatever rows
    # were collected before it happened.
    print(f"Exception occurred: {e}")
    # Save the file(temp)
    xlsx.save(file_name)
    print(f"File saved as {file_name} after exception")
finally:
    # Always shut the browser down, even if saving failed.
    driver.quit()
    print("Webdriver closed.")
네이버 지도 리뷰 크롤링 코드이며, 위와 같이 작성 시 엑셀 파일은 생성되지만 크롤링이 전혀 안 되고 있는 상황입니다. 아래는 결과값입니다.
DevTools listening on ws://127.0.0.1:51574/devtools/browser/16977761-9899-4120-9ba5-2ef94f71fbc6 Starting webdriver and accessing URL... Page loaded. Scrolling down... Clicking on '더보기' button... No more "더보기" button found, finished scrolling after 0 clicks. Found 0 reviews. File saved as naver_review_2024-06-07_10-45-31.xlsx Webdriver closed. PS C:\Users\ooooo\
위는 결과값입니다. 엑셀 파일은 생성이 되지만 리뷰를 전혀 읽지 못하는 상황입니다. 도움을 주시면 감사드리겠습니다. |
웹페이지 주소를 모바일 주소가 아닌 PC 주소로 변경하고
bs = BeautifulSoup(html, 'html.parser')
로 변경하면 페이지는 읽어오네요. 그 외에 작은 div 이름들을 변경해 주면 작동합니다.