专栏名称: 51Testing软件测试网
51Testing软件测试网,人气最旺的软件测试技术门户,提供软件测试社区交流,软件测试博客,人才服务,测试沙龙,测试杂志,测试资料下载等全方位信息服务,是国内最专业的软件测试就业培训、企业服务供应商...
目录
相关文章推荐
51好读  ›  专栏  ›  51Testing软件测试网

Selenium爬取淘宝实战练习

51Testing软件测试网  · 公众号  · 测试  · 2017-05-04 18:31

正文


1、项目流程


2、中心调度

# Central dispatcher
def main():
    """Run the whole crawl: search, read the page count, visit every page.

    Calls ``search()`` for the first result page, extracts the first run of
    digits from the text it returns as the total number of pages, then calls
    ``next_page()`` for pages 2..total. The browser session is always shut
    down at the end, whether or not the crawl succeeded.
    """
    try:
        total_text = search()
        if total_text is None:
            # search() returns None after reporting a timeout; there is
            # nothing to paginate in that case.
            return
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError('no page count found in %r' % (total_text,))
        total = int(match.group(1))
        for page in range(2, total + 1):
            next_page(page)
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing, but
        # include the exception so the cause is not lost.
        print('异常', e)
    finally:
        # quit() ends the WebDriver session (and the driver process), whereas
        # close() only closes the current window.
        browser.quit()

3、模拟查询

# Search by keyword
def search():
    """Open Taobao, submit the KEYWORDS query and scrape the first page.

    Returns:
        The text of the pager's "total" element, or ``None`` when any of the
        waits times out (a message is printed in that case).
    """
    try:
        browser.get('https://www.taobao.com/')
        # Wait until the search box is present in the DOM.
        input_search = wait.until(
            EC.presence_of_element_located((By.ID, 'q')))
        # Wait until the search button can be clicked.
        submit_button = wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'btn-search')))
        input_search.send_keys(KEYWORDS)
        submit_button.click()
        # NOTE(review): the descendant-combinator space in this selector was
        # restored (the pasted listing had all whitespace stripped) — confirm
        # against the live page markup.
        total = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.wraper div.total')))
        get_products()
        return total.text
    except TimeoutException:
        print('响应超时')

4、下一页的操作

# Crawl the next page
def next_page(index, max_retries=3):
    """Jump to result page ``index`` via the pager form and scrape it.

    Args:
        index: Page number to jump to.
        max_retries: How many additional attempts to make when the pager
            elements do not show up in time.

    Raises:
        TimeoutException, NoSuchElementException: re-raised from the last
            attempt once all retries are used up.
    """
    for attempt in range(max_retries + 1):
        try:
            # NOTE(review): the descendant-combinator space after
            # '#mainsrp-pager' was restored in the two selectors below (the
            # pasted listing had all whitespace stripped) — confirm against
            # the live page markup.
            page_input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager div.form>input')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager div.form>span.btn.J_Submit')))
            page_input.clear()
            page_input.send_keys(str(index))
            submit.click()
            # The jump is complete once the highlighted pager item shows the
            # requested page number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager>div>div>div>ul>li.item.active>span'),
                str(index)))
            get_products()
            return
        except (TimeoutException, NoSuchElementException):
            # WebDriverWait.until() signals a missing element by raising
            # TimeoutException, so both types have to be handled here.
            print('元素未加载')
            if attempt == max_retries:
                raise

5、商品信息的解析

# Collect every product on the current page
def get_products():
    """Parse all product entries on the current result page and save them.

    Waits for the item list to be present, parses the page source with
    PyQuery, builds one dict per item and passes it to ``save_to_mongo()``.
    """
    # NOTE(review): descendant-combinator spaces in the selectors below were
    # restored (the pasted listing had all whitespace stripped) — confirm
    # against the live page markup.
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    doc = PyQuery(browser.page_source)
    for item in doc('.m-itemlist .items .item').items():
        product = {
            'image': item.find('.pic-link .img').attr('data-src'),
            # Drops the first two characters of the price text before the
            # float conversion (presumably a currency sign and a separator —
            # TODO confirm).
            'price': float(item.find('.price').text()[2:]),
            # Drops the last three characters of the deal-count text
            # (presumably a trailing label — TODO confirm).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
            'keywords': KEYWORDS,
        }
        save_to_mongo(product=product)

6、完整代码

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Crawl Taobao search results with Selenium and store them in MongoDB."""

import re

import pymongo
from pyquery import PyQuery
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Expected to supply KEYWORDS, MONGO_HOST, MONGO_DB and MONGO_TABLE, which
# this script uses but does not define itself.
from setting import *

# MongoDB connection and target database.
client = pymongo.MongoClient(MONGO_HOST)
db = client[MONGO_DB]

# One shared Chrome session; every explicit wait gives up after 10 seconds.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)

# Search by keyword
def search():
    """Open Taobao, submit the KEYWORDS query and scrape the first page.

    Returns:
        The text of the pager's "total" element, or ``None`` when any of the
        waits times out (a message is printed in that case).
    """
    try:
        browser.get('https://www.taobao.com/')
        # Wait until the search box is present in the DOM.
        input_search = wait.until(
            EC.presence_of_element_located((By.ID, 'q')))
        # Wait until the search button can be clicked.
        submit_button = wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'btn-search')))
        input_search.send_keys(KEYWORDS)
        submit_button.click()
        # NOTE(review): the descendant-combinator space in this selector was
        # restored (the pasted listing had all whitespace stripped) — confirm
        # against the live page markup.
        total = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.wraper div.total')))
        get_products()
        return total.text
    except TimeoutException:
        print('响应超时')

# Crawl the next page
def next_page(index, max_retries=3):
    """Jump to result page ``index`` via the pager form and scrape it.

    Args:
        index: Page number to jump to.
        max_retries: How many additional attempts to make when the pager
            elements do not show up in time.

    Raises:
        TimeoutException, NoSuchElementException: re-raised from the last
            attempt once all retries are used up.
    """
    for attempt in range(max_retries + 1):
        try:
            # NOTE(review): the descendant-combinator space after
            # '#mainsrp-pager' was restored in the two selectors below (the
            # pasted listing had all whitespace stripped) — confirm against
            # the live page markup.
            page_input = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager div.form>input')))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager div.form>span.btn.J_Submit')))
            page_input.clear()
            page_input.send_keys(str(index))
            submit.click()
            # The jump is complete once the highlighted pager item shows the
            # requested page number.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager>div>div>div>ul>li.item.active>span'),
                str(index)))
            get_products()
            return
        except (TimeoutException, NoSuchElementException):
            # WebDriverWait.until() signals a missing element by raising
            # TimeoutException, so both types have to be handled here.
            print('元素未加载')
            if attempt == max_retries:
                raise

# Collect every product on the current page
def get_products():
    """Parse all product entries on the current result page and save them.

    Waits for the item list to be present, parses the page source with
    PyQuery, builds one dict per item and passes it to ``save_to_mongo()``.
    """
    # NOTE(review): descendant-combinator spaces in the selectors below were
    # restored (the pasted listing had all whitespace stripped) — confirm
    # against the live page markup.
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    doc = PyQuery(browser.page_source)
    for item in doc('.m-itemlist .items .item').items():
        product = {
            'image': item.find('.pic-link .img').attr('data-src'),
            # Drops the first two characters of the price text before the
            # float conversion (presumably a currency sign and a separator —
            # TODO confirm).
            'price': float(item.find('.price').text()[2:]),
            # Drops the last three characters of the deal-count text
            # (presumably a trailing label — TODO confirm).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
            'keywords': KEYWORDS,
        }
        save_to_mongo(product=product)

# Save to MongoDB
def save_to_mongo(product):
    """Insert one product dict into the MONGO_TABLE collection.

    Prints a success message together with the product when the write is
    acknowledged, and a failure message when the insert raises. Errors are
    reported rather than propagated, so one bad record does not stop the
    crawl.

    Args:
        product: The product dict to store.
    """
    try:
        # insert_one() replaces Collection.insert(), which is deprecated and
        # no longer available in PyMongo 4.
        result = db[MONGO_TABLE].insert_one(product)
        if result.acknowledged:
            print('保存成功', product)
    except Exception as e:
        print('保存失败', e)

# Central dispatcher
def main():
    """Run the whole crawl: search, read the page count, visit every page.

    Calls ``search()`` for the first result page, extracts the first run of
    digits from the text it returns as the total number of pages, then calls
    ``next_page()`` for pages 2..total. The browser session is always shut
    down at the end, whether or not the crawl succeeded.
    """
    try:
        total_text = search()
        if total_text is None:
            # search() returns None after reporting a timeout; there is
            # nothing to paginate in that case.
            return
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError('no page count found in %r' % (total_text,))
        total = int(match.group(1))
        for page in range(2, total + 1):
            next_page(page)
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing, but
        # include the exception so the cause is not lost.
        print('异常', e)
    finally:
        # quit() ends the WebDriver session (and the driver process), whereas
        # close() only closes the current window.
        browser.quit()

# Run the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()

7、运行结果


推荐阅读






请到「今天看啥」查看全文