Sockets (socket)
import socket

def retBanner(ip, port):
    # Connect and read the service banner; return None on any failure
    try:
        socket.setdefaulttimeout(2)
        s = socket.socket()
        s.connect((ip, port))
        banner = s.recv(1024)  # read up to the next 1024 bytes from the socket
        return banner.decode(errors='ignore')  # recv() returns bytes; decode for string matching
    except OSError:
        return None

# Banners of known-vulnerable server versions
lines = (
    "3Com 3CDaemon FTP Server Version 2.0",
    "Ability Server 2.34",
    "CCProxy Telnet Service Ready",
    "ESMTP TABS Mail Server for Windows NT",
    "FreeFloat Ftp Server (Version 1.00)",
    "IMAP4rev1 MDaemon 9.6.4 ready",
    "MailEnable Service, Version: 0-1.54",
    "NetDecision-HTTP-Server 1.0",
    "PSO Proxy 0.9",
    "SAMBAR",
    "Sami FTP Server 2.0.2",
    "Spipe 1.0",
    "TelSrv 1.5",
    "WDaemon 6.8.5",
    "WinGate 6.1.1",
    "Xitami",
    "YahooPOPs! Simple Mail Transfer Service Ready"
)

def checkVulns(banner):
    for line in lines:
        if line in banner:
            print('[+] Server is vulnerable: {}'.format(banner))

def main():
    portList = [21, 22, 25, 80, 110, 443]
    for x in range(147, 150):
        ip = '192.168.95.{}'.format(x)
        for port in portList:
            banner = retBanner(ip, port)
            if banner:
                print('[+] {}:{}'.format(ip, banner))
                checkVulns(banner)

if __name__ == '__main__':
    main()
Scraping Web Data
Whether the data is semi-structured (HTML) or structured (JSON, XML), parsing what a website exposes yields far more information. For example, the USGS earthquake site (https://earthquake.usgs.gov/earthquakes/) visualizes global earthquake activity and also lets you extract the underlying data in several formats.
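As a taste of the structured formats, here is a minimal sketch of fetching and parsing one of the JSON feeds; the GeoJSON summary URL and field names are assumptions about the USGS feed layout and worth verifying against the site:

import requests

# Hourly summary feed (assumed URL; check the USGS feeds page for the current list)
feed = 'https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_hour.geojson'
data = requests.get(feed).json()  # parse the JSON body into a dict
for quake in data['features']:    # 'features'/'properties' per the GeoJSON convention
    props = quake['properties']
    print(props['mag'], props['place'])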
# Parse a URL into its components
from urllib.parse import urlparse

uc = urlparse("https://www.baidu.com")
print(uc)  # ParseResult(scheme='https', netloc='www.baidu.com', path='', params='', query='', fragment='')
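The named fields of the ParseResult can also be read individually, which comes in handy later when rebuilding absolute URLs:

print(uc.scheme)  # https
print(uc.netloc)  # www.baidu.com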
# Send a request
import requests

url = 'https://www.baidu.com'
# html = requests.get(url).text  # garbled: without a charset header, requests falls back to a guessed encoding
html = requests.get(url).content.decode('utf-8')
print(type(html))  # <class 'str'>
print(html)
if "百度" in html:
    print('Page contains the keyword "百度"')
# Regular expressions
import re

regex = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"  # email addresses
emails = re.findall(regex, html)  # every email address in the response body
print(set(emails))                # deduplicated
String matching like the above extracts information from a page, but imprecisely. Precise extraction requires parsing the page's structure, and BeautifulSoup is a library built for exactly that. Official site: https://www.crummy.com/software/BeautifulSoup/.
pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests

url = 'https://www.baidu.com'
html = requests.get(url).content.decode('utf-8')
# Parse the page into a tag tree
sp = BeautifulSoup(html, 'html.parser')
# Collect every <a> tag
links = sp.find_all('a')
for link in links:
    print(type(link), link)
# <class 'bs4.element.Tag'> <a class="mnav" href="http://news.baidu.com" name="tj_trnews">新闻</a>
# <class 'bs4.element.Tag'> <a class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a>
# <class 'bs4.element.Tag'> <a class="mnav" href="http://map.baidu.com" name="tj_trmap">地图</a>
# <class 'bs4.element.Tag'> <a class="mnav" href="http://v.baidu.com" name="tj_trvideo">视频</a>
# <class 'bs4.element.Tag'> <a class="mnav" href="http://tieba.baidu.com" name="tj_trtieba">贴吧</a>
# <class 'bs4.element.Tag'> <a class="lb" href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1" name="tj_login">登录</a>
# <class 'bs4.element.Tag'> <a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="display: block;">更多产品</a>
# <class 'bs4.element.Tag'> <a href="http://home.baidu.com">关于百度</a>
# <class 'bs4.element.Tag'> <a href="http://ir.baidu.com">About Baidu</a>
# <class 'bs4.element.Tag'> <a href="http://www.baidu.com/duty/">使用百度前必读</a>
# <class 'bs4.element.Tag'> <a class="cp-feedback" href="http://jianyi.baidu.com/">意见反馈</a>
print(links[0].contents) # ['新闻']
print(links[0].get('href')) # http://news.baidu.com
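Beyond find_all, BeautifulSoup also accepts CSS selectors through select(). A small sketch reusing the sp object above; the mnav class comes from the sample output:

# Only the nav links, selected by CSS class
for tag in sp.select('a.mnav'):
    print(tag.get('href'), tag.contents)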
DEMO: downloading a site's images
# _*_ coding: utf-8 _*_
from functools import reduce
import pathlib
from bs4 import BeautifulSoup
import requests
import sys, os
from urllib.parse import urlparse
from urllib.request import urlopen

if len(sys.argv) < 2:
    print("Please pass a url argument")
    exit(1)
url = sys.argv[1]
domain = "{}://{}".format(urlparse(url).scheme, urlparse(url).hostname)

r = requests.get(url)
r.encoding = r.apparent_encoding  # detect the page encoding
html = r.text
sp = BeautifulSoup(html, 'html.parser')
all_links = sp.find_all(['a', 'img'])
for link in all_links:
    src = link.get('src')
    href = link.get('href')
    targets = [src, href]
    for t in targets:
        # skip missing attributes and anything without an image extension
        if t is None or reduce(lambda s, i: s and i not in t, ('.jpg', '.png', '.gif'), True):
            continue
        if t.startswith('http'):
            full_path = t
        elif t.startswith('//'):
            full_path = "https:" + t  # protocol-relative URL
        else:
            full_path = domain + t    # path relative to the site root
        print("Downloading: " + full_path)
        image_dir = os.path.join(os.path.dirname(__file__), "target", url.split('/')[-1])
        if not os.path.exists(image_dir):
            pathlib.Path(image_dir).mkdir(parents=True, exist_ok=True)
        filename = (lambda s: s.path[s.path.rfind('/')+1:])(urlparse(full_path))
        image = urlopen(full_path)
        fp = open(os.path.join(image_dir, filename), 'wb')
        fp.write(image.read())
        fp.close()
        print("Downloaded: " + full_path)
$ xxxxxx\vuepress2-note\code\demo-python\network\download-image.py https://www.baidu.com
Downloading: https://www.baidu.com/img/bd_logo1.png
Downloaded: https://www.baidu.com/img/bd_logo1.png
Downloading: http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1
Downloaded: http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1
Downloading: https://www.baidu.com/img/gs.gif
Downloaded: https://www.baidu.com/img/gs.gif

$ tree xxxxxx\vuepress2-note\code\demo-python\network\target
target
└── www.baidu.com
    ├── bd_logo1.png
    ├── gs.gif
    └── login.gif
Detecting whether page content has changed
import requests
import hashlib

r = requests.get('http://target.web.site.page')
sig = hashlib.md5(r.text.encode('utf-8')).hexdigest()  # fingerprint of the page body
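To turn the fingerprint into an actual change detector, keep the previous hash around and re-fetch on an interval. A minimal sketch; the URL and polling period are placeholders:

import time
import hashlib
import requests

url = 'http://target.web.site.page'  # placeholder target
last_sig = None
while True:
    body = requests.get(url).text
    sig = hashlib.md5(body.encode('utf-8')).hexdigest()
    if last_sig is not None and sig != last_sig:
        print('[+] Page content changed')
    last_sig = sig
    time.sleep(60)  # poll once a minute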
Driving the Browser
requests can only fetch the raw HTML, but some tasks also require executing the JavaScript embedded in the page.
The selenium module solves this: it drives a real browser installed on the system, just as if a person were operating it!
pip install selenium
Tip
If you use the Firefox browser, the Firebug add-on is worth installing; it makes analyzing pages much easier!
from selenium import webdriver
web = webdriver.Firefox()
web.get('https://www.sina.com.cn')
web.quit()
webdriver method | Description |
---|---|
get_window_position() | Get the window position (top-left corner) |
set_window_position(x,y) | Set the window position (top-left corner) |
maximize_window() | Maximize the window |
get_window_size() | Get the window size |
set_window_size(x,y) | Set the window size |
refresh() | Reload the current page |
back() | Go back one page |
forward() | Go forward one page |
close() | Close the window |
quit() | Shut down the browser |
get(url) | Navigate to the given page |
save_screenshot(filename) | Save a PNG screenshot of the current page (captures the full page, not limited by the window size) |
current_url | URL of the current page |
page_source | HTML source of the current page |
title | Title of the current page |
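A quick sketch exercising the three read-only properties at the bottom of the table:

from selenium import webdriver

web = webdriver.Firefox()
web.get('https://www.baidu.com')
print(web.current_url)        # the address after any redirect
print(web.title)              # contents of the page's <title>
print(len(web.page_source))   # size of the rendered HTML
web.quit()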
# _*_ coding: utf-8 _*_
# Save a screenshot of each page
from selenium import webdriver

urls = [
    "https://www.sina.com.cn",
    "https://www.sohu.com",
    "https://www.eastmoney.com",
    "https://www.newone.com.cn",
    "https://www.baidu.com"
]
web = webdriver.Firefox()
web.set_window_position(0, 0)
web.set_window_size(800, 600)  # the window size does not limit the screenshot size
for i, url in enumerate(urls):
    web.get(url)
    web.save_screenshot("webpage{}.png".format(i))
web.quit()
BeautifulSoup is not needed here: Selenium's webdriver can locate page elements on its own.
webdriver method | Description |
---|---|
find_element(by, value) | Find the first element matching value, using the by strategy |
find_element_by_class_name(name) | Find by CSS class name |
find_element_by_css_selector(selector) | Find by CSS selector |
find_element_by_id(id) | Find by element id |
find_element_by_link_text(text) | Find a link by its exact text |
find_element_by_name(name) | Find by the name attribute |
find_element_by_tag_name(name) | Find by tag name |
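The find_element_by_* helpers all funnel into find_element(by, value), and newer Selenium releases keep only that unified form. A sketch using the By constants; the 'kw' id for Baidu's search box is an assumption:

from selenium import webdriver
from selenium.webdriver.common.by import By

web = webdriver.Firefox()
web.get('https://www.baidu.com')
box = web.find_element(By.ID, 'kw')  # 'kw': assumed id of the search input
print(box.tag_name)                  # input
web.quit()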
Elements found this way support the following methods:
element method | Description |
---|---|
clear() | Clear the element's content |
click() | Click the element |
is_displayed() | Whether the element is visible |
is_enabled() | Whether the element is enabled |
is_selected() | Whether the element is selected |
send_keys(value) | Send a string of characters to the element; value can also be special keys (see the sketch after the login demo below) |
# _*_ coding: utf-8 _*_
# Simulate a login
from selenium import webdriver

web = webdriver.Firefox()
web.get("https://www.jd.com")
web.find_element_by_id('ttbar-login').click()
web.find_element_by_name('loginname').clear()
web.find_element_by_name('loginname').send_keys('your account')
web.find_element_by_name('nloginpwd').clear()
web.find_element_by_name('nloginpwd').send_keys('your password')
web.find_element_by_id('loginsubmit').click()
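As noted in the element-method table, send_keys also accepts special keys through selenium's Keys constants. A minimal sketch driving a search from the keyboard; the 'kw' id is the same assumption as before:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

web = webdriver.Firefox()
web.get('https://www.baidu.com')
box = web.find_element_by_id('kw')  # assumed id of the search input
box.send_keys('python')
box.send_keys(Keys.RETURN)          # press Enter instead of clicking the button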