A Brief Analysis of Scrapy (Part 2)

0x00. Preface

As mentioned in the previous post, we now pick things up from the crawl command, that is, from the following code in cmdline.py:

cmd.crawler_process = CrawlerProcess(settings)
_run_print_help(parser, _run_command, cmd, args, opts)
sys.exit(cmd.exitcode)

So let's open crawler.py and find the CrawlerProcess class.

0x01. Prelude to the Engine

class CrawlerProcess(CrawlerRunner):
    """
    # runs multiple scrapy crawlers in a single process
    A class to run multiple scrapy crawlers in a process simultaneously.
    # extends CrawlerRunner with shutdown handling and the Twisted reactor (the async machinery)
    This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
    for starting a Twisted `reactor`_ and handling shutdown signals, like the
    keyboard interrupt command Ctrl-C. It also configures top-level logging.

    This utility should be a better fit than
    :class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
    Twisted `reactor`_ within your application.
    # must be instantiated with a Settings object
    The CrawlerProcess object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible of using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    """

    def __init__(self, settings=None):
        # call the parent's __init__ as well; note that the parent initializer really does run
        super(CrawlerProcess, self).__init__(settings)
        install_shutdown_handlers(self._signal_shutdown)
        configure_logging(self.settings)
        log_scrapy_info(self.settings)

As for that async mechanism, there is an introduction here: https://www.cnblogs.com/LittleHann/p/5232318.html?utm_source=tuicool&utm_medium=referral
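
To make the idea concrete, here is a minimal, non-Scrapy sketch of Twisted's Deferred pattern (the one-second delay and the messages are just placeholders): a Deferred is returned right away, a callback is attached to it, and the callback fires later when the result arrives.

from twisted.internet import defer, reactor

def do_work():
    d = defer.Deferred()
    # pretend some asynchronous work finishes after one second
    reactor.callLater(1, d.callback, "work finished")
    return d

def on_done(result):
    print(result)
    reactor.stop()

do_work().addCallback(on_done)
reactor.run()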

That is not the focus here; it is enough to know that the crawler relies on this async mechanism. The parent class CrawlerRunner is initialized as follows:

def __init__(self, settings=None):
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.settings = settings
    self.spider_loader = _get_spider_loader(settings)
    self._crawlers = set()
    self._active = set()

This calls the _get_spider_loader function; stepping into it, we get:

def _get_spider_loader(settings):
    """ Get SpiderLoader instance from settings """
    if settings.get('SPIDER_MANAGER_CLASS'):  # in my settings this key is actually not set
        warnings.warn(
            'SPIDER_MANAGER_CLASS option is deprecated. '
            'Please use SPIDER_LOADER_CLASS.',
            category=ScrapyDeprecationWarning, stacklevel=2
        )
    cls_path = settings.get('SPIDER_MANAGER_CLASS',
                            settings.get('SPIDER_LOADER_CLASS'))
    loader_cls = load_object(cls_path)
    try:
        verifyClass(ISpiderLoader, loader_cls)
    except DoesNotImplement:
        warnings.warn(
            'SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does '
            'not fully implement scrapy.interfaces.ISpiderLoader interface. '
            'Please add all missing methods to avoid unexpected runtime errors.',
            category=ScrapyDeprecationWarning, stacklevel=2
        )
    return loader_cls.from_settings(settings.frozencopy())

As the function name suggests, this reads the spider-loader class path from the settings via settings.get('SPIDER_LOADER_CLASS'), and the default settings contain the line SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'. That dotted path is then turned into an actual class by load_object; a rough sketch of what that resolution amounts to is shown below.
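
A minimal sketch, using only importlib, of what resolving such a dotted path boils down to; the real helper is scrapy.utils.misc.load_object, whose internals may differ in detail:

from importlib import import_module

def load_object_sketch(path):
    # split 'scrapy.spiderloader.SpiderLoader' into module path and attribute name
    module_path, _, name = path.rpartition('.')
    module = import_module(module_path)   # imports scrapy.spiderloader
    return getattr(module, name)          # fetches the SpiderLoader class

SpiderLoader = load_object_sketch('scrapy.spiderloader.SpiderLoader')
print(SpiderLoader)

With the loader class resolved, let's step into spiderloader.py and find the corresponding class: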

@implementer(ISpiderLoader)
class SpiderLoader(object):
    """
    # loads all spiders
    SpiderLoader is a class which locates and loads spiders
    in a Scrapy project.
    """
    def __init__(self, settings):
        self.spider_modules = settings.getlist('SPIDER_MODULES')
        self.warn_only = settings.getbool('SPIDER_LOADER_WARN_ONLY')
        self._spiders = {}  # dict of spider name -> spider class
        self._found = defaultdict(list)
        self._load_all_spiders()

    def _check_name_duplicates(self):
        dupes = ["\n".join("  {cls} named {name!r} (in {module})".format(
                               module=mod, cls=cls, name=name)
                           for (mod, cls) in locations)
                 for name, locations in self._found.items()
                 if len(locations) > 1]
        if dupes:
            msg = ("There are several spiders with the same name:\n\n"
                   "{}\n\n  This can cause unexpected behavior.".format(
                        "\n\n".join(dupes)))
            warnings.warn(msg, UserWarning)

    def _load_spiders(self, module):
        # build the name -> class dict
        for spcls in iter_spider_classes(module):
            self._found[spcls.name].append((module.__name__, spcls.__name__))
            self._spiders[spcls.name] = spcls

    def _load_all_spiders(self):
        for name in self.spider_modules:
            try:
                for module in walk_modules(name):
                    self._load_spiders(module)
            except ImportError as e:
                if self.warn_only:
                    msg = ("\n{tb}Could not load spiders from module '{modname}'. "
                           "See above traceback for details.".format(
                                modname=name, tb=traceback.format_exc()))
                    warnings.warn(msg, RuntimeWarning)
                else:
                    raise
        self._check_name_duplicates()

As the class name and the comment say, SpiderLoader locates and loads all the spiders in the project, building a name-to-class dict, much like the loading of commands in the previous post. A small usage sketch follows, and after that we go back to CrawlerRunner and continue:
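
A quick usage sketch, assuming it is run from inside a Scrapy project that defines a spider named 'myspider' (both the project and the spider name are assumptions for illustration):

from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

settings = get_project_settings()              # picks up the project's settings.py
loader = SpiderLoader.from_settings(settings)  # triggers _load_all_spiders()
print(loader.list())                           # names of every discovered spider
spider_cls = loader.load('myspider')           # look up a spider class by name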

def crawl(self, crawler_or_spidercls, *args, **kwargs):
    """
    Run a crawler with the provided arguments.

    It will call the given Crawler's :meth:`~Crawler.crawl` method, while
    keeping track of it so it can be stopped later.
    # if it is not already a Crawler instance, one will be created from it
    If `crawler_or_spidercls` isn't a :class:`~scrapy.crawler.Crawler`
    instance, this method will try to create one using this parameter as
    the spider class given to it.

    Returns a deferred that is fired when the crawling is finished.

    :param crawler_or_spidercls: already created crawler, or a spider class
        or spider's name inside the project to create it
    :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
        :class:`~scrapy.spiders.Spider` subclass or string

    :param list args: arguments to initialize the spider

    :param dict kwargs: keyword arguments to initialize the spider
    """
    # create a Crawler instance
    crawler = self.create_crawler(crawler_or_spidercls)
    return self._crawl(crawler, *args, **kwargs)

def _crawl(self, crawler, *args, **kwargs):
    self.crawlers.add(crawler)
    d = crawler.crawl(*args, **kwargs)
    self._active.add(d)

    def _done(result):
        self.crawlers.discard(crawler)
        self._active.discard(d)
        return result

    return d.addBoth(_done)

def create_crawler(self, crawler_or_spidercls):
    """
    Return a :class:`~scrapy.crawler.Crawler` object.

    * If `crawler_or_spidercls` is a Crawler, it is returned as-is.
    * If `crawler_or_spidercls` is a Spider subclass, a new Crawler
      is constructed for it.
    * If `crawler_or_spidercls` is a string, this function finds
      a spider with this name in a Scrapy project (using spider loader),
      then creates a Crawler instance for it.
    """
    if isinstance(crawler_or_spidercls, Crawler):
        return crawler_or_spidercls
    return self._create_crawler(crawler_or_spidercls)

def _create_crawler(self, spidercls):
    if isinstance(spidercls, six.string_types):
        spidercls = self.spider_loader.load(spidercls)
    # OK, step into the Crawler class
    return Crawler(spidercls, self.settings)

def stop(self):
    """
    Stops simultaneously all the crawling jobs taking place.

    Returns a deferred that is fired when they all have ended.
    """
    return defer.DeferredList([c.stop() for c in list(self.crawlers)])

@defer.inlineCallbacks
def join(self):
    """
    join()

    Returns a deferred that is fired when all managed :attr:`crawlers` have
    completed their executions.
    """
    while self._active:
        yield defer.DeferredList(self._active)
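
Before stepping into Crawler, here is the documented "run from a script" usage that exercises exactly these methods (the pattern the docstrings refer to); MySpider and its URL are made up for illustration:

import scrapy
from scrapy.crawler import CrawlerProcess

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['http://example.com']

    def parse(self, response):
        yield {'title': response.css('title::text').extract_first()}

process = CrawlerProcess({'LOG_LEVEL': 'INFO'})
process.crawl(MySpider)   # internally: create_crawler() + Crawler.crawl()
process.start()           # starts the Twisted reactor (blocking)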

Now step into the Crawler class; this class is crucial:

class Crawler(object):

    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None

    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True

        try:
            # here the spider instance is actually created
            self.spider = self._create_spider(*args, **kwargs)
            # create the engine
            self.engine = self._create_engine()
            # get the iterator returned by the spider's start_requests()
            start_requests = iter(self.spider.start_requests())
            # call the engine's open_spider method with the spider instance and its start requests
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see http://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise

    def _create_spider(self, *args, **kwargs):
        return self.spidercls.from_crawler(self, *args, **kwargs)

    # enter the engine; from here on it takes over
    def _create_engine(self):
        return ExecutionEngine(self, lambda _: self.stop())

    @defer.inlineCallbacks
    def stop(self):
        if self.crawling:
            self.crawling = False
            yield defer.maybeDeferred(self.engine.stop)
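
Crawler.crawl above is decorated with @defer.inlineCallbacks, which lets a chain of Deferreds be written in a sequential style: each yield suspends until the yielded Deferred fires. A minimal, non-Scrapy sketch of the pattern (the step names are only placeholders):

from twisted.internet import defer, reactor, task

@defer.inlineCallbacks
def sequence():
    yield task.deferLater(reactor, 0.5, lambda: None)  # stands in for open_spider
    result = yield defer.succeed('engine started')     # stands in for engine.start
    defer.returnValue(result)

def done(result):
    print(result)
    reactor.stop()

sequence().addCallback(done)
reactor.run()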

The remaining code in crawler.py, back in CrawlerProcess:

def _signal_shutdown(self, signum, _):
    install_shutdown_handlers(self._signal_kill)
    signame = signal_names[signum]
    logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
                {'signame': signame})
    reactor.callFromThread(self._graceful_stop_reactor)

def _signal_kill(self, signum, _):
    install_shutdown_handlers(signal.SIG_IGN)
    signame = signal_names[signum]
    logger.info('Received %(signame)s twice, forcing unclean shutdown',
                {'signame': signame})
    reactor.callFromThread(self._stop_reactor)

def start(self, stop_after_crawl=True):
    """
    This method starts a Twisted `reactor`_, adjusts its pool size to
    :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache based
    on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

    If `stop_after_crawl` is True, the reactor will be stopped after all
    crawlers have finished, using :meth:`join`.

    :param boolean stop_after_crawl: stop or not the reactor when all
        crawlers have finished
    """
    if stop_after_crawl:
        d = self.join()
        # Don't start the reactor if the deferreds are already fired
        if d.called:
            return
        d.addBoth(self._stop_reactor)
    # start running the reactor
    reactor.installResolver(self._get_dns_resolver())
    tp = reactor.getThreadPool()
    tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
    reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
    reactor.run(installSignalHandlers=False)  # blocking call
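
The two signal handlers above implement a "first Ctrl-C shuts down gracefully, the second forces an unclean shutdown" pattern. A generic sketch of the same idea using only the plain signal module (no Twisted, no Scrapy helpers):

import signal
import sys

def force_shutdown(signum, frame):
    print('second SIGINT: forcing unclean shutdown')
    sys.exit(1)

def graceful_shutdown(signum, frame):
    print('SIGINT received: shutting down gracefully, send again to force')
    signal.signal(signal.SIGINT, force_shutdown)  # next Ctrl-C takes the forced path

signal.signal(signal.SIGINT, graceful_shutdown)
signal.pause()  # wait for signals (POSIX only)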

0x02. Summary

Last time we got as far as creating the CrawlerProcess instance. It mainly loads the settings (which tell it where the user-written spider classes live), loads and instantiates the user's spider class, and finally hands everything over to the engine. In other words, at this point we have only completed the first step of the official architecture diagram: handing start_requests to the engine. Taken together with the previous post, this is exactly how it has to be: the program first has to discover which commands exist before it can understand the command you typed, and it has to discover which spiders exist before it knows which one you want to run. Next time we will step into the engine and see how it schedules everything.
