element = driver.find_element_by_id("passwd-id")
element = driver.find_element_by_name("passwd")
element = driver.find_element_by_xpath("//input[@id='passwd-id']")
Once you have the input element, you can keep driving the browser and type data into it, and you can also simulate arrow-key input through the `Keys` class.
from selenium.webdriver.common.keys import Keys

element.send_keys(" and some", Keys.ARROW_DOWN)  # ARROW_DOWN is the down-arrow key
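For context, a minimal end-to-end sketch of the snippets above. This assumes the Selenium 3-style API used here, a Chrome driver available on PATH, and a placeholder URL; the "passwd-id" element comes from the example above.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get("https://example.com/login")   # placeholder URL

element = driver.find_element_by_id("passwd-id")
element.clear()                                    # clear any pre-filled text
element.send_keys("some text", Keys.ARROW_DOWN)    # type text, then press the down arrow

driver.quit()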
# Implicit wait
# If certain elements are not immediately available, an implicit wait tells WebDriver
# to keep polling for them for a set amount of time before giving up.
# The default wait time is 0 seconds.
driver.implicitly_wait(10)

# Explicit wait
# Defined in your code: wait until a certain condition occurs before executing further.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
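The explicit-wait imports above are not yet used in the snippet; the usual pattern is sketched below, with a placeholder URL and a made-up element id "myDynamicElement".

driver = webdriver.Chrome()
driver.get("https://example.com")  # placeholder URL
try:
    # Block for up to 10 seconds until the element is present, then return it
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "myDynamicElement"))
    )
finally:
    driver.quit()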
html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """
# contents and children both return a tag's direct children; descendants is a generator
# that walks recursively through all descendants.
for child in tag.descendants:
    print(child)
# Output:
# <b>The Dormouse's story</b>
# The Dormouse's story

# If a tag has only one NavigableString child (or only a single child), you can use .string:
print(tag.string)
# If the tag contains multiple children, it cannot tell which child's content .string should
# return, so .string is None.
# In that case, .strings gives a generator over all text under the tag; loop over it to get
# every string. The output may contain a lot of whitespace or blank lines; .stripped_strings
# removes the extra whitespace.
for string in soup.strings:
    print(string)

for string in soup.stripped_strings:
    print(string)
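To make the snippet above runnable against the html_doc defined earlier, here is a small sketch; it assumes soup is built with the built-in html.parser and that tag is the first <p> tag.

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, "html.parser")
tag = soup.p           # the first <p class="title"> tag

for child in tag.descendants:
    print(child)        # <b>The Dormouse's story</b>, then its inner text

print(tag.string)       # "The Dormouse's story" (only one nested string)

for s in soup.stripped_strings:
    print(s)            # every piece of text in the document, whitespace stripped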
class Crawl01DownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must return one of the following:
        # - return None: continue processing this request. Scrapy will keep processing it,
        #   executing all other middlewares until, finally, the appropriate downloader
        #   handler is called and the request is performed (and its response downloaded).
        # - return a Response object: Scrapy won't bother calling any other process_request()
        #   or process_exception() methods, or the appropriate download function; it'll return
        #   that response. The process_response() methods of installed middleware are always
        #   called on every response.
        # - return a Request object: Scrapy will stop calling process_request() methods and
        #   reschedule the returned request. Once the newly returned request is performed,
        #   the appropriate middleware chain will be called on the downloaded response.
        # - raise IgnoreRequest: the process_exception() methods of installed downloader
        #   middleware will be called. If none of them handle the exception, the errback
        #   function of the request (Request.errback) is called. If no code handles the
        #   raised exception, it is ignored and not logged (unlike other exceptions).
        return None
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must return one of the following:
        # - return a Response object: that response (it could be the same given response, or a
        #   brand-new one) will continue to be processed with the process_response() of the
        #   next middleware in the chain.
        # - return a Request object: the middleware chain is halted and the returned request
        #   is rescheduled to be downloaded in the future. This is the same behavior as if a
        #   request is returned from process_request().
        # - raise IgnoreRequest: the errback function of the request (Request.errback) is
        #   called. If no code handles the raised exception, it is ignored and not logged
        #   (unlike other exceptions).
        return response
    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request() (from another downloader
        # middleware) raises an exception.
        # Must return one of the following:
        # - return None: continue processing this exception. Scrapy will keep executing any
        #   other process_exception() methods of installed middleware, until no middleware
        #   is left and the default exception handling kicks in.
        # - return a Response object: stops the process_exception() chain. The
        #   process_response() method chain of installed middleware is started, and Scrapy
        #   won't bother calling any other process_exception() methods.
        # - return a Request object: stops the process_exception() chain. The returned
        #   request is rescheduled to be downloaded in the future.
        pass
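A downloader middleware only takes effect once it is enabled in settings.py. A minimal sketch, assuming the default project layout where the project module is named crawl01 and the template priority 543 is kept:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # Path assumes the default layout: <project>/middlewares.py
    "crawl01.middlewares.Crawl01DownloaderMiddleware": 543,  # smaller numbers run closer to the engine
}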
# Ensure all spiders share the same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # use scrapy_redis's dedup filter
# Enables stats shared based on Redis
STATS_CLASS = "scrapy_redis.stats.RedisStatsCollector"
# Default requests serializer is pickle, but it can be changed to any module
# with loads and dumps functions. Note that pickle is not compatible between
# python versions.
# Caveat: In python 3.x, the serializer must return strings keys and support
# bytes as values. Because of this reason the json or msgpack module will not
# work by default. In python 2.x there is no such issue and you can use
# 'json' or 'msgpack' as serializers.
#SCHEDULER_SERIALIZER = "scrapy_redis.picklecompat"
# Schedule requests using a priority queue. (default)
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Alternative queues.
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
# Max idle time to prevent the spider from being closed when distributed crawling.
# This only works if queue class is SpiderQueue or SpiderStack,
# and may also block the same time when your spider start at the first time (because the queue is empty).
#SCHEDULER_IDLE_BEFORE_CLOSE = 10
# Maximum idle time before close spider.
# When the number of idle seconds is greater than MAX_IDLE_TIME_BEFORE_CLOSE, the crawler will close.
# If 0, the crawler will DontClose forever to wait for the next request.
# If negative number, the crawler will immediately close when the queue is empty, just like Scrapy.
#MAX_IDLE_TIME_BEFORE_CLOSE = 0
# Store scraped items in redis for post-processing.
ITEM_PIPELINES = {
    # With scrapy_redis's pipeline, a copy of every item is also stored in redis.
    # This is optional: you can skip it and use your own pipeline instead,
    # storing items locally (or in MySQL, for example), as sketched below.
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
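If you skip RedisPipeline and store items yourself, a hypothetical local pipeline could look like this; the class name, output file, and registration path are made up for illustration.

# pipelines.py - hypothetical local-storage pipeline
import json

class LocalJsonPipeline:
    def open_spider(self, spider):
        # one JSON object per line
        self.file = open("items.jl", "w", encoding="utf-8")

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.file.close()

Register it in ITEM_PIPELINES in place of (or alongside) RedisPipeline, e.g. 'myproject.pipelines.LocalJsonPipeline': 300.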
# The item pipeline serializes and stores the items in this redis key.
#REDIS_ITEMS_KEY = '%(spider)s:items'
# The items serializer is by default ScrapyJSONEncoder. You can use any
# importable path to a callable object.
#REDIS_ITEMS_SERIALIZER = 'json.dumps'
# Specify the host and port to use when connecting to Redis (optional).
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
# Specify the full Redis URL for connecting (optional).
# If set, this takes precedence over the REDIS_HOST and REDIS_PORT settings.
#REDIS_URL = 'redis://user:pass@hostname:9001'
# If True, it uses redis' ``SPOP`` operation. You have to use the ``SADD``
# command to add URLs to the redis queue. This could be useful if you
# want to avoid duplicates in your start urls list and the order of
# processing does not matter.
#REDIS_START_URLS_AS_SET = False
# If True, it uses redis ``zrevrange`` and ``zremrangebyrank`` operations. You have to use
# the ``zadd`` command to add URLs and scores to the redis queue. This could be useful if you
# want to use priority and avoid duplicates in your start urls list.
#REDIS_START_URLS_AS_ZSET = False
# Default start urls key for RedisSpider and RedisCrawlSpider.
#REDIS_START_URLS_KEY = '%(name)s:start_urls'
# Use an encoding other than utf-8 for redis; set this parameter if needed.
#REDIS_ENCODING = 'latin1'
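To tie these settings together, a minimal scrapy_redis spider sketch follows; the spider name, redis key, and parse logic are placeholder assumptions.

# spiders/my_redis_spider.py - minimal sketch, names are placeholders
from scrapy_redis.spiders import RedisSpider

class MyRedisSpider(RedisSpider):
    name = "my_redis_spider"
    # matches the default REDIS_START_URLS_KEY = '%(name)s:start_urls'
    redis_key = "my_redis_spider:start_urls"

    def parse(self, response):
        # placeholder parse: yield the page title
        yield {"url": response.url, "title": response.css("title::text").get()}

Seed the crawl by pushing a URL into that key, e.g. redis-cli lpush my_redis_spider:start_urls https://example.com (use sadd or zadd instead if REDIS_START_URLS_AS_SET or REDIS_START_URLS_AS_ZSET is enabled).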