From deddba00c3094e0af11aa7d3b9c4f20a31d1b43f Mon Sep 17 00:00:00 2001 From: daijro Date: Fri, 11 Oct 2024 21:36:19 -0500 Subject: [PATCH] Allow passing in a region or language into locale 0.2.13 - Passing in a region like `locale='US'` will generate a language based on the region, "US". - Passing in a language like `locale='en'` will generate a region based on the language, "en". - Fix locale parsing errors when generating using territoryInfo.xml #36 --- pythonlib/camoufox/exceptions.py | 23 ++++- pythonlib/camoufox/locale.py | 159 ++++++++++++++++++++++--------- pythonlib/camoufox/utils.py | 4 +- pythonlib/camoufox/warnings.yml | 4 + pythonlib/pyproject.toml | 2 +- 5 files changed, 142 insertions(+), 50 deletions(-) diff --git a/pythonlib/camoufox/exceptions.py b/pythonlib/camoufox/exceptions.py index 9a02abe..21db403 100644 --- a/pythonlib/camoufox/exceptions.py +++ b/pythonlib/camoufox/exceptions.py @@ -86,7 +86,20 @@ class UnknownIPLocation(LocaleError): ... -class UnknownTerritory(LocaleError): +class InvalidLocale(LocaleError): + """ + Raised when the locale input is invalid. + """ + + @classmethod + def invalid_input(cls, locale: str) -> 'InvalidLocale': + return cls( + f"Invalid locale: '{locale}'. Must be either a region, language, " + "language-region, or language-script-region." + ) + + +class UnknownTerritory(InvalidLocale): """ Raised when the territory is unknown. """ @@ -94,6 +107,14 @@ class UnknownTerritory(LocaleError): ... +class UnknownLanguage(InvalidLocale): + """ + Raised when the language is unknown. + """ + + ... + + class NotInstalledGeoIPExtra(ImportError): """ Raised when the geoip2 module is not installed. diff --git a/pythonlib/camoufox/locale.py b/pythonlib/camoufox/locale.py index 1105967..cfb10ed 100644 --- a/pythonlib/camoufox/locale.py +++ b/pythonlib/camoufox/locale.py @@ -7,8 +7,15 @@ import numpy as np from language_tags import tags from camoufox.pkgman import LOCAL_DATA, rprint, webdl +from camoufox.warnings import LeakWarning -from .exceptions import NotInstalledGeoIPExtra, UnknownIPLocation, UnknownTerritory +from .exceptions import ( + InvalidLocale, + NotInstalledGeoIPExtra, + UnknownIPLocation, + UnknownLanguage, + UnknownTerritory, +) from .ip import validate_ip try: @@ -90,9 +97,7 @@ def verify_locales(locales: List[str]) -> None: for loc in locales: if tags.check(loc): continue - raise ValueError( - f"Invalid locale: '{loc}'. All locales must be in the format of language[-script][-region]" - ) + raise InvalidLocale.invalid_input(loc) def normalize_locale(locale: str) -> Locale: @@ -107,7 +112,7 @@ def normalize_locale(locale: str) -> Locale: # Parse the locale parser = tags.tag(locale) if not parser.region: - raise ValueError(f"Invalid locale: {locale}. Region is required.") + raise InvalidLocale.invalid_input(locale) record = parser.language.data['record'] @@ -119,6 +124,29 @@ def normalize_locale(locale: str) -> Locale: ) +def handle_locale(locale: str) -> Locale: + """ + Handles a locale input, normalizing it if necessary. + """ + if len(locale) > 3: + return normalize_locale(locale) + + try: + return SELECTOR.from_region(locale) + except UnknownTerritory: + pass + + try: + language = SELECTOR.from_language(locale) + except UnknownLanguage: + pass + else: + LeakWarning.warn('no_region') + return language + + raise InvalidLocale.invalid_input(locale) + + """ Helpers to fetch geolocation, timezone, and locale data given an IP. """ @@ -176,7 +204,7 @@ def get_geolocation(ip: str) -> Geolocation: with geoip2.database.Reader(str(MMDB_FILE)) as reader: resp = reader.city(ip) - iso_code = cast(str, resp.registered_country.iso_code) + iso_code = cast(str, resp.registered_country.iso_code).upper() location = resp.location # Check if any required attributes are missing @@ -184,8 +212,7 @@ def get_geolocation(ip: str) -> Geolocation: raise UnknownIPLocation(f"Unknown IP location: {ip}") # Get a statistically correct locale based on the country code - locale_finder = GetLocaleFromTerritory(iso_code) - locale = locale_finder.get_locale() + locale = SELECTOR.from_region(iso_code) return Geolocation( locale=locale, @@ -211,60 +238,100 @@ def get_unicode_info() -> ET.Element: return data -class GetLocaleFromTerritory: +def _as_float(element: ET.Element, attr: str) -> float: """ - Calculates a random language based on the territory code, - based on the probability that a person speaks the language in the territory. + Converts an attribute to a float. + """ + return float(element.get(attr, 0)) + + +class StatisticalLocaleSelector: + """ + Selects a random locale based on statistical data. + Takes either a territory code or a language code, and generates a Locale object. """ - def __init__(self, iso_code: str): - self.iso_code = iso_code.upper() + def __init__(self): self.root = get_unicode_info() - self.languages, self.probabilities = self._load_territory_data() - - def _load_territory_data(self) -> Tuple[np.ndarray, np.ndarray]: - territory = self.root.find(f"territory[@type='{self.iso_code}']") + def _load_territory_data(self, iso_code: str) -> Tuple[np.ndarray, np.ndarray]: + """ + Calculates a random language based on the territory code, + based on the probability that a person speaks the language in the territory. + """ + territory = self.root.find(f"territory[@type='{iso_code}']") if territory is None: - raise UnknownTerritory(f"Unknown territory: {self.iso_code}") + raise UnknownTerritory(f"Unknown territory: {iso_code}") - lang_population = territory.findall('languagePopulation') + lang_populations = territory.findall('languagePopulation') + if not lang_populations: + raise ValueError(f"No language data found for region: {iso_code}") - if not lang_population: - raise ValueError(f"No language data found for territory: {self.iso_code}") + languages = np.array([lang.get('type') for lang in lang_populations]) + percentages = np.array([_as_float(lang, 'populationPercent') for lang in lang_populations]) - # Use list comprehension for faster data extraction - languages = np.array([lang.get('type') for lang in lang_population]) - percentages = np.array( - [float(lang.get('populationPercent', '0')) for lang in lang_population] - ) + return self.normalize_probabilities(languages, percentages) - # Normalize probabilities - total = np.sum(percentages) - probabilities = percentages / total - - return languages, probabilities - - def get_random_language(self) -> str: + def _load_language_data(self, language: str) -> Tuple[np.ndarray, np.ndarray]: """ - Get a random language based on the territory ISO code. + Calculates a random region for a language + based on the total speakers of the language in that region. """ - return np.random.choice(self.languages, p=self.probabilities) + territories = self.root.findall(f'.//territory/languagePopulation[@type="{language}"]/..') + if not territories: + raise UnknownLanguage(f"No region data found for language: {language}") - def get_locale(self) -> Locale: + regions = [] + percentages = [] + + for terr in territories: + region = terr.get('type') + if region is None: + continue # Skip if region is not found + + lang_pop = terr.find(f'languagePopulation[@type="{language}"]') + if lang_pop is None: + continue # This shouldn't happen due to our XPath, but just in case + + regions.append(region) + percentages.append( + _as_float(lang_pop, 'populationPercent') + * _as_float(terr, 'literacyPercent') + / 10_000 + * _as_float(terr, 'population') + ) + + if not regions: + raise ValueError(f"No valid region data found for language: {language}") + + return self.normalize_probabilities(np.array(regions), np.array(percentages)) + + def normalize_probabilities( + self, languages: np.ndarray, freq: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: + """ + Normalize probabilities. + """ + total = np.sum(freq) + return languages, freq / total + + def from_region(self, region: str) -> Locale: """ Get a random locale based on the territory ISO code. Returns as a Locale object. """ - language = self.get_random_language() - return normalize_locale(f"{language}-{self.iso_code}") + languages, probabilities = self._load_territory_data(region) + language = np.random.choice(languages, p=probabilities).replace('_', '-') + return normalize_locale(f"{language}-{region}") + + def from_language(self, language: str) -> Locale: + """ + Get a random locale based on the language. + Returns as a Locale object. + """ + regions, probabilities = self._load_language_data(language) + region = np.random.choice(regions, p=probabilities) + return normalize_locale(f"{language}-{region}") -if __name__ == "__main__": - # Extra tests... - from timeit import timeit - - print('LanguageSelector:', timeit(lambda: GetLocaleFromTerritory('ES'), number=100)) - - ts = GetLocaleFromTerritory('ES') - print('get_random_language:', timeit(lambda: ts.get_random_language(), number=10000)) +SELECTOR = StatisticalLocaleSelector() diff --git a/pythonlib/camoufox/utils.py b/pythonlib/camoufox/utils.py index ba8cef7..70a09be 100644 --- a/pythonlib/camoufox/utils.py +++ b/pythonlib/camoufox/utils.py @@ -26,7 +26,7 @@ from .exceptions import ( ) from .fingerprints import from_browserforge, generate_fingerprint from .ip import Proxy, public_ip, valid_ipv4, valid_ipv6 -from .locale import geoip_allowed, get_geolocation, normalize_locale +from .locale import geoip_allowed, get_geolocation, handle_locale from .pkgman import OS_NAME, get_path, installed_verstr from .warnings import LeakWarning from .xpi_dl import add_default_addons @@ -418,7 +418,7 @@ def get_launch_options( # Set locale if locale: - parsed_locale = normalize_locale(locale) + parsed_locale = handle_locale(locale) config.update(parsed_locale.as_config()) # Pass the humanize option diff --git a/pythonlib/camoufox/warnings.yml b/pythonlib/camoufox/warnings.yml index 9d6bee7..4c3d839 100644 --- a/pythonlib/camoufox/warnings.yml +++ b/pythonlib/camoufox/warnings.yml @@ -32,6 +32,10 @@ ff_version: >- If rotating the Firefox version is absolutely necessary, it would be more advisable to rotate between older versions of Camoufox instead. +no_region: >- + Because you did not pass in a locale region, Camoufox will generate one for you. + This can cause suspicion if your IP does not match your locale region. + # headless-non-linux: >- # Headless mode is only recommended on Linux at this time. # Some WAFs are able to detect headless browsers. The issue is currently being investigated. diff --git a/pythonlib/pyproject.toml b/pythonlib/pyproject.toml index e5f7040..e8ab7f3 100644 --- a/pythonlib/pyproject.toml +++ b/pythonlib/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "camoufox" -version = "0.2.12" +version = "0.2.13" description = "Wrapper around Playwright to help launch Camoufox" authors = ["daijro "] license = "MIT"