from __future__ import annotations from collections import defaultdict import re from typing import ( Any, Callable, cast, Dict, Iterator, Iterable, List, Optional, Sequence, Type, Union, ) import warnings from bs4._deprecation import _deprecated from bs4.element import ( AttributeDict, NavigableString, PageElement, ResultSet, Tag, ) from bs4._typing import ( _AtMostOneElement, _AttributeValue, _OneElement, _PageElementMatchFunction, _QueryResults, _RawAttributeValues, _RegularExpressionProtocol, _StrainableAttribute, _StrainableElement, _StrainableString, _StringMatchFunction, _TagMatchFunction, ) class ElementFilter(object): """`ElementFilter` encapsulates the logic necessary to decide: 1. whether a `PageElement` (a `Tag` or a `NavigableString`) matches a user-specified query. 2. whether a given sequence of markup found during initial parsing should be turned into a `PageElement` at all, or simply discarded. The base class is the simplest `ElementFilter`. By default, it matches everything and allows all markup to become `PageElement` objects. You can make it more selective by passing in a user-defined match function, or defining a subclass. Most users of Beautiful Soup will never need to use `ElementFilter`, or its more capable subclass `SoupStrainer`. Instead, they will use methods like :py:meth:`Tag.find`, which will convert their arguments into `SoupStrainer` objects and run them against the tree. However, if you find yourself wanting to treat the arguments to Beautiful Soup's find_*() methods as first-class objects, those objects will be `SoupStrainer` objects. You can create them yourself and then make use of functions like `ElementFilter.filter()`. """ match_function: Optional[_PageElementMatchFunction] def __init__(self, match_function: Optional[_PageElementMatchFunction] = None): """Pass in a match function to easily customize the behavior of `ElementFilter.match` without needing to subclass. :param match_function: A function that takes a `PageElement` and returns `True` if that `PageElement` matches some criteria. """ self.match_function = match_function @property def includes_everything(self) -> bool: """Does this `ElementFilter` obviously include everything? If so, the filter process can be made much faster. The `ElementFilter` might turn out to include everything even if this returns `False`, but it won't include everything in an obvious way. The base `ElementFilter` implementation includes things based on the match function, so includes_everything is only true if there is no match function. """ return not self.match_function @property def excludes_everything(self) -> bool: """Does this `ElementFilter` obviously exclude everything? If so, Beautiful Soup will issue a warning if you try to use it when parsing a document. The `ElementFilter` might turn out to exclude everything even if this returns `False`, but it won't exclude everything in an obvious way. The base `ElementFilter` implementation excludes things based on a match function we can't inspect, so excludes_everything is always false. """ return False def match(self, element: PageElement, _known_rules:bool=False) -> bool: """Does the given PageElement match the rules set down by this ElementFilter? The base implementation delegates to the function passed in to the constructor. :param _known_rules: Defined for compatibility with SoupStrainer._match(). Used more for consistency than because we need the performance optimization. """ if not _known_rules and self.includes_everything: return True if not self.match_function: return True return self.match_function(element) def filter(self, generator: Iterator[PageElement]) -> Iterator[_OneElement]: """The most generic search method offered by Beautiful Soup. Acts like Python's built-in `filter`, using `ElementFilter.match` as the filtering function. """ # If there are no rules at all, don't bother filtering. Let # anything through. if self.includes_everything: for i in generator: yield i while True: try: i = next(generator) except StopIteration: break if i: if self.match(i, _known_rules=True): yield cast("_OneElement", i) def find(self, generator: Iterator[PageElement]) -> _AtMostOneElement: """A lower-level equivalent of :py:meth:`Tag.find`. You can pass in your own generator for iterating over `PageElement` objects. The first one that matches this `ElementFilter` will be returned. :param generator: A way of iterating over `PageElement` objects. """ for match in self.filter(generator): return match return None def find_all( self, generator: Iterator[PageElement], limit: Optional[int] = None ) -> _QueryResults: """A lower-level equivalent of :py:meth:`Tag.find_all`. You can pass in your own generator for iterating over `PageElement` objects. Only elements that match this `ElementFilter` will be returned in the :py:class:`ResultSet`. :param generator: A way of iterating over `PageElement` objects. :param limit: Stop looking after finding this many results. """ results: _QueryResults = ResultSet(self) for match in self.filter(generator): results.append(match) if limit is not None and len(results) >= limit: break return results def allow_tag_creation( self, nsprefix: Optional[str], name: str, attrs: Optional[_RawAttributeValues] ) -> bool: """Based on the name and attributes of a tag, see whether this `ElementFilter` will allow a `Tag` object to even be created. By default, all tags are parsed. To change this, subclass `ElementFilter`. :param name: The name of the prospective tag. :param attrs: The attributes of the prospective tag. """ return True def allow_string_creation(self, string: str) -> bool: """Based on the content of a string, see whether this `ElementFilter` will allow a `NavigableString` object based on this string to be added to the parse tree. By default, all strings are processed into `NavigableString` objects. To change this, subclass `ElementFilter`. :param str: The string under consideration. """ return True class MatchRule(object): """Each MatchRule encapsulates the logic behind a single argument passed in to one of the Beautiful Soup find* methods. """ string: Optional[str] pattern: Optional[_RegularExpressionProtocol] present: Optional[bool] exclude_everything: Optional[bool] # TODO-TYPING: All MatchRule objects also have an attribute # ``function``, but the type of the function depends on the # subclass. def __init__( self, string: Optional[Union[str, bytes]] = None, pattern: Optional[_RegularExpressionProtocol] = None, function: Optional[Callable] = None, present: Optional[bool] = None, exclude_everything: Optional[bool] = None ): if isinstance(string, bytes): string = string.decode("utf8") self.string = string if isinstance(pattern, bytes): self.pattern = re.compile(pattern.decode("utf8")) elif isinstance(pattern, str): self.pattern = re.compile(pattern) else: self.pattern = pattern self.function = function self.present = present self.exclude_everything = exclude_everything values = [ x for x in (self.string, self.pattern, self.function, self.present, self.exclude_everything) if x is not None ] if len(values) == 0: raise ValueError( "Either string, pattern, function, present, or exclude_everything must be provided." ) if len(values) > 1: raise ValueError( "At most one of string, pattern, function, present, and exclude_everything must be provided." ) def _base_match(self, string: Optional[str]) -> Optional[bool]: """Run the 'cheap' portion of a match, trying to get an answer without calling a potentially expensive custom function. :return: True or False if we have a (positive or negative) match; None if we need to keep trying. """ # self.exclude_everything matches nothing. if self.exclude_everything: return False # self.present==True matches everything except None. if self.present is True: return string is not None # self.present==False matches _only_ None. if self.present is False: return string is None # self.string does an exact string match. if self.string is not None: # print(f"{self.string} ?= {string}") return self.string == string # self.pattern does a regular expression search. if self.pattern is not None: # print(f"{self.pattern} ?~ {string}") if string is None: return False return self.pattern.search(string) is not None return None def matches_string(self, string: Optional[str]) -> bool: _base_result = self._base_match(string) if _base_result is not None: # No need to invoke the test function. return _base_result if self.function is not None and not self.function(string): # print(f"{self.function}({string}) == False") return False return True def __repr__(self) -> str: cls = type(self).__name__ return f"<{cls} string={self.string} pattern={self.pattern} function={self.function} present={self.present}>" def __eq__(self, other: Any) -> bool: return ( isinstance(other, MatchRule) and self.string == other.string and self.pattern == other.pattern and self.function == other.function and self.present == other.present ) class TagNameMatchRule(MatchRule): """A MatchRule implementing the rules for matches against tag name.""" function: Optional[_TagMatchFunction] def matches_tag(self, tag: Tag) -> bool: base_value = self._base_match(tag.name) if base_value is not None: return base_value # The only remaining possibility is that the match is determined # by a function call. Call the function. function = cast(_TagMatchFunction, self.function) if function(tag): return True return False class AttributeValueMatchRule(MatchRule): """A MatchRule implementing the rules for matches against attribute value.""" function: Optional[_StringMatchFunction] class StringMatchRule(MatchRule): """A MatchRule implementing the rules for matches against a NavigableString.""" function: Optional[_StringMatchFunction] class SoupStrainer(ElementFilter): """The `ElementFilter` subclass used internally by Beautiful Soup. A `SoupStrainer` encapsulates the logic necessary to perform the kind of matches supported by methods such as :py:meth:`Tag.find`. `SoupStrainer` objects are primarily created internally, but you can create one yourself and pass it in as ``parse_only`` to the `BeautifulSoup` constructor, to parse a subset of a large document. Internally, `SoupStrainer` objects work by converting the constructor arguments into `MatchRule` objects. Incoming tags/markup are matched against those rules. :param name: One or more restrictions on the tags found in a document. :param attrs: A dictionary that maps attribute names to restrictions on tags that use those attributes. :param string: One or more restrictions on the strings found in a document. :param kwargs: A dictionary that maps attribute names to restrictions on tags that use those attributes. These restrictions are additive to any specified in ``attrs``. """ name_rules: List[TagNameMatchRule] attribute_rules: Dict[str, List[AttributeValueMatchRule]] string_rules: List[StringMatchRule] def __init__( self, name: Optional[_StrainableElement] = None, attrs: Dict[str, _StrainableAttribute] = {}, string: Optional[_StrainableString] = None, **kwargs: _StrainableAttribute, ): if string is None and "text" in kwargs: string = cast(Optional[_StrainableString], kwargs.pop("text")) warnings.warn( "As of version 4.11.0, the 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.", DeprecationWarning, stacklevel=2, ) if name is None and not attrs and not string and not kwargs: # Special case for backwards compatibility. Instantiating # a SoupStrainer with no arguments whatsoever gets you one # that matches all Tags, and only Tags. self.name_rules = [TagNameMatchRule(present=True)] else: self.name_rules = cast( List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule)) ) self.attribute_rules = defaultdict(list) if not isinstance(attrs, dict): # Passing something other than a dictionary as attrs is # sugar for matching that thing against the 'class' # attribute. attrs = {"class": attrs} for attrdict in attrs, kwargs: for attr, value in attrdict.items(): if attr == "class_" and attrdict is kwargs: # If you pass in 'class_' as part of kwargs, it's # because class is a Python reserved word. If you # pass it in as part of the attrs dict, it's # because you really are looking for an attribute # called 'class_'. attr = "class" if value is None: value = False for rule_obj in self._make_match_rules(value, AttributeValueMatchRule): self.attribute_rules[attr].append( cast(AttributeValueMatchRule, rule_obj) ) self.string_rules = cast( List[StringMatchRule], list(self._make_match_rules(string, StringMatchRule)) ) #: DEPRECATED 4.13.0: You shouldn't need to check this under #: any name (.string or .text), and if you do, you're probably #: not taking into account all of the types of values this #: variable might have. Look at the .string_rules list instead. self.__string = string @property def includes_everything(self) -> bool: """Check whether the provided rules will obviously include everything. (They might include everything even if this returns `False`, but not in an obvious way.) """ return not self.name_rules and not self.string_rules and not self.attribute_rules @property def excludes_everything(self) -> bool: """Check whether the provided rules will obviously exclude everything. (They might exclude everything even if this returns `False`, but not in an obvious way.) """ if (self.string_rules and (self.name_rules or self.attribute_rules)): # This is self-contradictory, so the rules exclude everything. return True # If there's a rule that ended up treated as an "exclude everything" # rule due to creating a logical inconsistency, then the rules # exclude everything. if any(x.exclude_everything for x in self.string_rules): return True if any(x.exclude_everything for x in self.name_rules): return True for ruleset in self.attribute_rules.values(): if any(x.exclude_everything for x in ruleset): return True return False @property def string(self) -> Optional[_StrainableString]: ":meta private:" warnings.warn( "Access to deprecated property string. (Look at .string_rules instead) -- Deprecated since version 4.13.0.", DeprecationWarning, stacklevel=2, ) return self.__string @property def text(self) -> Optional[_StrainableString]: ":meta private:" warnings.warn( "Access to deprecated property text. (Look at .string_rules instead) -- Deprecated since version 4.13.0.", DeprecationWarning, stacklevel=2, ) return self.__string def __repr__(self) -> str: return f"<{self.__class__.__name__} name={self.name_rules} attrs={self.attribute_rules} string={self.string_rules}>" @classmethod def _make_match_rules( cls, obj: Optional[Union[_StrainableElement, _StrainableAttribute]], rule_class: Type[MatchRule], ) -> Iterator[MatchRule]: """Convert a vaguely-specific 'object' into one or more well-defined `MatchRule` objects. :param obj: Some kind of object that corresponds to one or more matching rules. :param rule_class: Create instances of this `MatchRule` subclass. """ if obj is None: return if isinstance(obj, (str, bytes)): yield rule_class(string=obj) elif isinstance(obj, bool): yield rule_class(present=obj) elif callable(obj): yield rule_class(function=obj) elif isinstance(obj, _RegularExpressionProtocol): yield rule_class(pattern=obj) elif hasattr(obj, "__iter__"): if not obj: # The attribute is being matched against the null set, # which means it should exclude everything. yield rule_class(exclude_everything=True) for o in obj: if not isinstance(o, (bytes, str)) and hasattr(o, "__iter__"): # This is almost certainly the user's # mistake. This list contains another list, which # opens up the possibility of infinite # self-reference. In the interests of avoiding # infinite recursion, we'll treat this as an # impossible match and issue a rule that excludes # everything, rather than looking inside. warnings.warn( f"Ignoring nested list {o} to avoid the possibility of infinite recursion.", stacklevel=5, ) yield rule_class(exclude_everything=True) continue for x in cls._make_match_rules(o, rule_class): yield x else: yield rule_class(string=str(obj)) def matches_tag(self, tag: Tag) -> bool: """Do the rules of this `SoupStrainer` trigger a match against the given `Tag`? If the `SoupStrainer` has any `TagNameMatchRule`, at least one must match the `Tag` or its `Tag.name`. If there are any `AttributeValueMatchRule` for a given attribute, at least one of them must match the attribute value. If there are any `StringMatchRule`, at least one must match, but a `SoupStrainer` that *only* contains `StringMatchRule` cannot match a `Tag`, only a `NavigableString`. """ # If there are no rules at all, let anything through. #if self.includes_everything: # return True # String rules cannot not match a Tag on their own. if not self.name_rules and not self.attribute_rules: return False # Optimization for a very common case where the user is # searching for a tag with one specific name, and we're # looking at a tag with a different name. if ( not tag.prefix and len(self.name_rules) == 1 and self.name_rules[0].string is not None and tag.name != self.name_rules[0].string ): return False # If there are name rules, at least one must match. It can # match either the Tag object itself or the prefixed name of # the tag. prefixed_name = None if tag.prefix: prefixed_name = f"{tag.prefix}:{tag.name}" if self.name_rules: name_matches = False for rule in self.name_rules: # attrs = " ".join( # [f"{k}={v}" for k, v in sorted(tag.attrs.items())] # ) # print(f"Testing <{tag.name} {attrs}>{tag.string} against {rule}") if rule.matches_tag(tag) or ( prefixed_name is not None and rule.matches_string(prefixed_name) ): name_matches = True break if not name_matches: return False # If there are attribute rules for a given attribute, at least # one of them must match. If there are rules for multiple # attributes, each attribute must have at least one match. for attr, rules in self.attribute_rules.items(): attr_value = tag.get(attr, None) this_attr_match = self._attribute_match(attr_value, rules) if not this_attr_match: return False # If there are string rules, at least one must match. if self.string_rules: _str = tag.string if _str is None: return False if not self.matches_any_string_rule(_str): return False return True def _attribute_match( self, attr_value: Optional[_AttributeValue], rules: Iterable[AttributeValueMatchRule], ) -> bool: attr_values: Sequence[Optional[str]] if isinstance(attr_value, list): attr_values = attr_value else: attr_values = [cast(str, attr_value)] def _match_attribute_value_helper(attr_values: Sequence[Optional[str]]) -> bool: for rule in rules: for attr_value in attr_values: if rule.matches_string(attr_value): return True return False this_attr_match = _match_attribute_value_helper(attr_values) if not this_attr_match and len(attr_values) > 1: # This cast converts Optional[str] to plain str. # # We know if there's more than one value, there can't be # any None in the list, because Beautiful Soup never uses # None as a value of a multi-valued attribute, and if None # is passed in as attr_value, it's turned into a list with # a single element (thus len(attr_values) > 1 fails). attr_values = cast(Sequence[str], attr_values) # Try again but treat the attribute value # as a single string. joined_attr_value = " ".join(attr_values) this_attr_match = _match_attribute_value_helper([joined_attr_value]) return this_attr_match def allow_tag_creation( self, nsprefix: Optional[str], name: str, attrs: Optional[_RawAttributeValues] ) -> bool: """Based on the name and attributes of a tag, see whether this `SoupStrainer` will allow a `Tag` object to even be created. :param name: The name of the prospective tag. :param attrs: The attributes of the prospective tag. """ if self.string_rules: # A SoupStrainer that has string rules can't be used to # manage tag creation, because the string rule can't be # evaluated until after the tag and all of its contents # have been parsed. return False prefixed_name = None if nsprefix: prefixed_name = f"{nsprefix}:{name}" if self.name_rules: # At least one name rule must match. name_match = False for rule in self.name_rules: for x in name, prefixed_name: if x is not None: if rule.matches_string(x): name_match = True break if not name_match: return False # For each attribute that has rules, at least one rule must # match. if attrs is None: attrs = AttributeDict() for attr, rules in self.attribute_rules.items(): attr_value = attrs.get(attr) if not self._attribute_match(attr_value, rules): return False return True def allow_string_creation(self, string: str) -> bool: """Based on the content of a markup string, see whether this `SoupStrainer` will allow it to be instantiated as a `NavigableString` object, or whether it should be ignored. """ if self.name_rules or self.attribute_rules: # A SoupStrainer that has name or attribute rules won't # match any strings; it's designed to match tags with # certain properties. return False if not self.string_rules: # A SoupStrainer with no string rules will match # all strings. return True if not self.matches_any_string_rule(string): return False return True def matches_any_string_rule(self, string: str) -> bool: """See whether the content of a string matches any of this `SoupStrainer`'s string rules. """ if not self.string_rules: return True for string_rule in self.string_rules: if string_rule.matches_string(string): return True return False def match(self, element: PageElement, _known_rules: bool=False) -> bool: """Does the given `PageElement` match the rules set down by this `SoupStrainer`? The find_* methods rely heavily on this method to find matches. :param element: A `PageElement`. :param _known_rules: Set to true in the common case where we already checked and found at least one rule in this SoupStrainer that might exclude a PageElement. Without this, we need to check .includes_everything every time, just to be safe. :return: `True` if the element matches this `SoupStrainer`'s rules; `False` otherwise. """ # If there are no rules at all, let anything through. if not _known_rules and self.includes_everything: return True if isinstance(element, Tag): return self.matches_tag(element) assert isinstance(element, NavigableString) if not (self.name_rules or self.attribute_rules): # A NavigableString can only match a SoupStrainer that # does not define any name or attribute rules. # Then it comes down to the string rules. return self.matches_any_string_rule(element) return False @_deprecated("allow_tag_creation", "4.13.0") def search_tag(self, name: str, attrs: Optional[_RawAttributeValues]) -> bool: """A less elegant version of `allow_tag_creation`. Deprecated as of 4.13.0""" ":meta private:" return self.allow_tag_creation(None, name, attrs) @_deprecated("match", "4.13.0") def search(self, element: PageElement) -> Optional[PageElement]: """A less elegant version of match(). Deprecated as of 4.13.0. :meta private: """ return element if self.match(element) else None