diff options
author | Patrik Teivonen <patrik.teivonen@qt.io> | 2022-10-25 13:31:12 +0300 |
---|---|---|
committer | Patrik Teivonen <patrik.teivonen@qt.io> | 2022-12-29 14:55:12 +0000 |
commit | 4e3ab8c3ce2973850a9bbcafb0636c386db97a45 (patch) | |
tree | 02fbbbf10116973896d79e5034f6d3c440f74d7e | |
parent | 5e0b0bdbf08df923f2b74e1c8d6d29e90663142c (diff) |
Make resolve_wildcard_uri() crawl urls asynchronously
Speed up URI resolution by resolving child directories concurrently.
Task-number: QTBUG-105693
Change-Id: I44fadc6de4097f43c57a5dd6df275bf94e0fc5e3
Reviewed-by: Iikka Eklund <iikka.eklund@qt.io>
-rw-r--r-- | packaging-tools/sdkcomponent.py | 35 | ||||
-rw-r--r-- | packaging-tools/tests/test_sdkcomponent.py | 8 |
2 files changed, 35 insertions, 8 deletions
diff --git a/packaging-tools/sdkcomponent.py b/packaging-tools/sdkcomponent.py index ddc58cd25..be013a9a2 100644 --- a/packaging-tools/sdkcomponent.py +++ b/packaging-tools/sdkcomponent.py @@ -29,13 +29,15 @@ # ############################################################################# +import asyncio import os import re +import sys from configparser import ConfigParser from dataclasses import dataclass, field from fnmatch import fnmatch from pathlib import Path -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import htmllistparse # type: ignore from urlpath import URL # type: ignore @@ -43,6 +45,12 @@ from urlpath import URL # type: ignore from bldinstallercommon import uri_exists from logging_util import init_logger +if sys.version_info < (3, 7): + from asyncio_backport import run as asyncio_run +else: + from asyncio import run as asyncio_run + + log = init_logger(__name__, debug_mode=False) @@ -228,7 +236,16 @@ class ArchiveResolver: return self.file_share_base_url.rstrip("/") + "/" + url.lstrip("/") return url - def resolve_uri_pattern(self, pattern: str, base_url: Optional[URL] = None) -> List[URL]: + async def fetch_in_executor(self, url: str) -> Tuple[Any, List[Any]]: + """Wrap fetch_listing in a Future and return it""" + if sys.version_info < (3, 7): + loop = asyncio.get_event_loop() # keep for Python 3.6 compatibility + else: + loop = asyncio.get_running_loop() + log.info("Crawl: %s", url) + return await loop.run_in_executor(None, htmllistparse.fetch_listing, url, 30) + + async def resolve_uri_pattern(self, pattern: str, base_url: Optional[URL] = None) -> List[URL]: """ Return payload URIs from remote tree, fnmatch pattern match for given arguments. Patterns will match arbitrary number of '/' allowing recursive search. 
@@ -245,19 +262,23 @@ class ArchiveResolver: # base_url from base_pattern if not specified base_url = base_url or URL(base_pattern.rsplit("/", 1)[0]) # get links from base_url - log.info("Crawl: %s", base_url) - links = htmllistparse.fetch_listing(base_url, timeout=30)[1] + _, links = await self.fetch_in_executor(base_url) # get fnmatch pattern matches from links recursively uri_list = [] + child_list = [] for link in links: if link.name.endswith("/"): # match the directory with base_pattern if fnmatch(base_url / link.name, base_pattern + "*"): - # recursively look for pattern matches inside the matching directory - uri_list.extend(self.resolve_uri_pattern(pattern, base_url / link.name)) + child_list.append(base_url / link.name) else: if fnmatch(base_url / link.name, pattern): uri_list.append(base_url / link.name) + # recursively look for pattern matches inside the matching child directories + coros = [self.resolve_uri_pattern(pattern, url) for url in child_list] + results = await asyncio.gather(*coros) + for item in results: + uri_list.extend(item) return uri_list def resolve_payload_uri(self, unresolved_archive_uri: str) -> List[str]: @@ -278,7 +299,7 @@ class ArchiveResolver: # is it a URL containing a fnmatch pattern if any(char in unresolved_archive_uri for char in ("*", "[", "]", "?")): pattern = self.absolute_url(unresolved_archive_uri) - return [str(url) for url in self.resolve_uri_pattern(pattern)] + return [str(url) for url in asyncio_run(self.resolve_uri_pattern(pattern))] # is it a file system path or an absolute URL which can be downloaded if os.path.exists(unresolved_archive_uri) or URL(unresolved_archive_uri).netloc: return [unresolved_archive_uri] diff --git a/packaging-tools/tests/test_sdkcomponent.py b/packaging-tools/tests/test_sdkcomponent.py index 49ed97144..8bc6b6f49 100644 --- a/packaging-tools/tests/test_sdkcomponent.py +++ b/packaging-tools/tests/test_sdkcomponent.py @@ -29,6 +29,7 @@ 
############################################################################# import os +import sys import tempfile import unittest from configparser import ConfigParser, ExtendedInterpolation @@ -47,6 +48,11 @@ from sdkcomponent import ( parse_ifw_sdk_comp, ) +if sys.version_info < (3, 7): + import asyncio_backport as asyncio +else: + import asyncio + def ifw_sdk_config_valid(section_name: str) -> ConfigParser: conf = ConfigParser(interpolation=ExtendedInterpolation()) @@ -334,7 +340,7 @@ class TestRunner(unittest.TestCase): @unittest.mock.patch("htmllistparse.fetch_listing", side_effect=create_listing) # type: ignore def test_pattern_archive_resolver(self, pattern: str, expected: List[str], _: Any) -> None: resolver = ArchiveResolver("", "") - self.assertCountEqual(resolver.resolve_uri_pattern(pattern, None), expected) + self.assertCountEqual(asyncio.run(resolver.resolve_uri_pattern(pattern, None)), expected) def test_locate_pkg_templ_dir_invalid(self) -> None: with tempfile.TemporaryDirectory(dir=os.getcwd()) as tmp_base_dir: |