# Start Chrome with remote debugging enabled before running this script:
#   chrome.exe --remote-debugging-port=9222 --user-data-dir="E:\data_info\selenium_data"
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
import json
import requests
import pymysql
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
# Connection settings for the MySQL database that receives the scraped rows.
config = dict(
    host="10.50.3.116",
    user="crm",
    password="123456",
    database="t8t_tbt_spider_test",
)
# Shared connection, opened once when the script starts.
db = pymysql.connect(**config)
def load_photo(url, name, timeout=30):
    """Download the image at ``url`` and save it to the local path ``name``.

    Args:
        url: HTTP(S) address of the image.
        name: Destination file path; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as an image.
    response.raise_for_status()
    with open(name, 'wb') as ft:
        ft.write(response.content)
def drop_scroll(browser):
    """Scroll the page down in steps so the browser loads its content.

    Sets ``document.documentElement.scrollTop`` to 10%, 30%, 50%, 70% and
    90% of the current ``scrollHeight``, issuing one ``execute_script`` call
    per step.

    Args:
        browser: Object exposing ``execute_script(js)``, e.g. a Selenium
            WebDriver.
    """
    for x in range(1, 11, 2):
        # Fraction of the total page height to scroll to on this step.
        j = x / 10
        js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % j
        browser.execute_script(js)
def switch_window(browser):
    """Point subsequent ``browser`` commands at the most recently listed window.

    Args:
        browser: Object exposing ``window_handles`` and
            ``switch_to.window(handle)``, e.g. a Selenium WebDriver.
    """
    # The last handle in the list is taken to be the newly opened window.
    windows = browser.window_handles
    browser.switch_to.window(windows[-1])
def switch_window_back(browser):
    """Point subsequent ``browser`` commands back at the first window handle.

    Args:
        browser: Object exposing ``window_handles`` and
            ``switch_to.window(handle)``, e.g. a Selenium WebDriver.
    """
    windows = browser.window_handles
    browser.switch_to.window(windows[0])
def _dismiss_popup(browser):
    """Click the dialog close button (aria-label '关闭') if one is present."""
    buttons = browser.find_elements("xpath", "//button[@aria-label='关闭']")
    if buttons:
        # Click through JavaScript rather than a native WebDriver click.
        browser.execute_script("arguments[0].click();", buttons[0])


# Build the profile URL from user input.
u_id = input('请输入https://www.zhihu.com/people/{u_id}中的u_id')
profile_url = f'https://www.zhihu.com/people/{u_id}'
page = int(input("要遍历的页数(从第一页开始)"))

# Attach to the already-running Chrome instance through its debugging port.
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
browser = webdriver.Chrome(options=chrome_options)

# Open the profile page and close the popup dialog if it is shown.
browser.get(profile_url)
_dismiss_popup(browser)
time.sleep(5)

# Parameterized statement: the driver escapes the values, so quotes inside a
# title or URL can no longer break (or inject into) the query.
insert_sql = (
    "insert into sp_zhihu_list (title,url,source_from,name,create_time) "
    "values (%s,%s,4,%s,%s)"
)
title_xpath = ".//a[@data-za-detail-view-element_name='Title']"

# Walk every answer on every requested page.
count = 1  # running number of the answer, used only for console output
# The prompt promises traversal from the first page, so start at 1.
for page_id in range(1, page + 1):
    # NOTE(review): the listing uses the /org/ path while the prompt and the
    # initial page use /people/ - confirm which one is intended.
    page_url = f'https://www.zhihu.com/org/{u_id}/answers?page={page_id}'
    browser.get(page_url)
    time.sleep(2)
    _dismiss_popup(browser)
    time.sleep(5)

    switch_window_back(browser)  # make sure commands target the first window
    drop_scroll(browser)  # scroll down so the page loads its content

    # Collect all answer items on the page.
    answers = browser.find_elements("xpath", "//div[@class='ContentItem AnswerItem']")
    for answer in answers:
        # Look the title link up once and reuse it for both fields.
        title_link = answer.find_element("xpath", title_xpath)
        answer_url = title_link.get_attribute('href')
        title = title_link.text
        print(count, answer_url, title, sep=',')

        nimeitime = time.time()
        with db.cursor() as cursor:
            cursor.execute(
                insert_sql,
                (title, answer_url, '装小蜜监理王志峰', str(nimeitime)),
            )
        db.commit()
        count = count + 1