Allow passing in a region or language into locale 0.2.13

- Passing in a region, such as `locale='US'`, will generate a language based on that region.
- Passing in a language, such as `locale='en'`, will generate a region based on that language.
- Fix locale parsing errors when generating a locale from territoryInfo.xml (#36).
This commit is contained in:
daijro 2024-10-11 21:36:19 -05:00
parent 5bb274ddc8
commit deddba00c3
5 changed files with 142 additions and 50 deletions

View file

@ -86,7 +86,20 @@ class UnknownIPLocation(LocaleError):
... ...
class InvalidLocale(LocaleError):
    """
    Raised when a locale string given as input cannot be accepted.
    """

    @classmethod
    def invalid_input(cls, locale: str) -> 'InvalidLocale':
        """
        Builds (but does not raise) the exception for an unusable locale string.

        The message quotes the offending input and lists the accepted forms.
        """
        message = (
            f"Invalid locale: '{locale}'. Must be either a region, language, "
            "language-region, or language-script-region."
        )
        return cls(message)
class UnknownTerritory(InvalidLocale):
""" """
Raised when the territory is unknown. Raised when the territory is unknown.
""" """
@ -94,6 +107,14 @@ class UnknownTerritory(LocaleError):
... ...
class UnknownLanguage(InvalidLocale):
    """
    Raised when the language is unknown, i.e. no data could be found for it.
    """
class NotInstalledGeoIPExtra(ImportError): class NotInstalledGeoIPExtra(ImportError):
""" """
Raised when the geoip2 module is not installed. Raised when the geoip2 module is not installed.

View file

@ -7,8 +7,15 @@ import numpy as np
from language_tags import tags from language_tags import tags
from camoufox.pkgman import LOCAL_DATA, rprint, webdl from camoufox.pkgman import LOCAL_DATA, rprint, webdl
from camoufox.warnings import LeakWarning
from .exceptions import NotInstalledGeoIPExtra, UnknownIPLocation, UnknownTerritory from .exceptions import (
InvalidLocale,
NotInstalledGeoIPExtra,
UnknownIPLocation,
UnknownLanguage,
UnknownTerritory,
)
from .ip import validate_ip from .ip import validate_ip
try: try:
@ -90,9 +97,7 @@ def verify_locales(locales: List[str]) -> None:
for loc in locales: for loc in locales:
if tags.check(loc): if tags.check(loc):
continue continue
raise ValueError( raise InvalidLocale.invalid_input(loc)
f"Invalid locale: '{loc}'. All locales must be in the format of language[-script][-region]"
)
def normalize_locale(locale: str) -> Locale: def normalize_locale(locale: str) -> Locale:
@ -107,7 +112,7 @@ def normalize_locale(locale: str) -> Locale:
# Parse the locale # Parse the locale
parser = tags.tag(locale) parser = tags.tag(locale)
if not parser.region: if not parser.region:
raise ValueError(f"Invalid locale: {locale}. Region is required.") raise InvalidLocale.invalid_input(locale)
record = parser.language.data['record'] record = parser.language.data['record']
@ -119,6 +124,29 @@ def normalize_locale(locale: str) -> Locale:
) )
def handle_locale(locale: str) -> Locale:
    """
    Turns a user-supplied locale string into a Locale.

    Inputs longer than three characters are passed straight to normalize_locale.
    Shorter inputs are tried as a region code first, then as a language code;
    when the language interpretation is used, a 'no_region' leak warning is
    issued because the region had to be generated.

    Raises InvalidLocale when the short input is neither a known region nor a
    known language.
    """
    # Too long to be a bare region/language code: treat it as a full locale tag.
    if len(locale) > 3:
        return normalize_locale(locale)

    # Region interpretation takes priority over the language interpretation.
    try:
        by_region = SELECTOR.from_region(locale)
    except UnknownTerritory:
        pass
    else:
        return by_region

    try:
        by_language = SELECTOR.from_language(locale)
    except UnknownLanguage:
        by_language = None

    if by_language is None:
        raise InvalidLocale.invalid_input(locale)

    LeakWarning.warn('no_region')
    return by_language
""" """
Helpers to fetch geolocation, timezone, and locale data given an IP. Helpers to fetch geolocation, timezone, and locale data given an IP.
""" """
@ -176,7 +204,7 @@ def get_geolocation(ip: str) -> Geolocation:
with geoip2.database.Reader(str(MMDB_FILE)) as reader: with geoip2.database.Reader(str(MMDB_FILE)) as reader:
resp = reader.city(ip) resp = reader.city(ip)
iso_code = cast(str, resp.registered_country.iso_code) iso_code = cast(str, resp.registered_country.iso_code).upper()
location = resp.location location = resp.location
# Check if any required attributes are missing # Check if any required attributes are missing
@ -184,8 +212,7 @@ def get_geolocation(ip: str) -> Geolocation:
raise UnknownIPLocation(f"Unknown IP location: {ip}") raise UnknownIPLocation(f"Unknown IP location: {ip}")
# Get a statistically correct locale based on the country code # Get a statistically correct locale based on the country code
locale_finder = GetLocaleFromTerritory(iso_code) locale = SELECTOR.from_region(iso_code)
locale = locale_finder.get_locale()
return Geolocation( return Geolocation(
locale=locale, locale=locale,
@ -211,60 +238,100 @@ def get_unicode_info() -> ET.Element:
return data return data
class GetLocaleFromTerritory: def _as_float(element: ET.Element, attr: str) -> float:
"""
Converts an attribute to a float.
"""
return float(element.get(attr, 0))
class StatisticalLocaleSelector:
    """
    Selects a random locale based on statistical data.
    Takes either a territory code or a language code, and generates a Locale object.

    The XML tree returned by get_unicode_info() is loaded once, when the
    selector is constructed, and reused for every lookup.
    """

    def __init__(self):
        self.root = get_unicode_info()

    def _load_territory_data(self, iso_code: str) -> Tuple[np.ndarray, np.ndarray]:
        """
        Collects the languages listed for a territory and the probability of each,
        based on the probability that a person speaks the language in the territory.

        Returns a (languages, probabilities) pair of equally sized arrays.

        Raises:
            UnknownTerritory: no territory element has this code. The lookup is
                an exact match on the `type` attribute, so case matters.
            ValueError: the territory has no languagePopulation entries.
        """
        territory = self.root.find(f"territory[@type='{iso_code}']")

        if territory is None:
            raise UnknownTerritory(f"Unknown territory: {iso_code}")

        lang_populations = territory.findall('languagePopulation')
        if not lang_populations:
            raise ValueError(f"No language data found for region: {iso_code}")

        languages = np.array([lang.get('type') for lang in lang_populations])
        percentages = np.array([_as_float(lang, 'populationPercent') for lang in lang_populations])

        return self.normalize_probabilities(languages, percentages)

    def _load_language_data(self, language: str) -> Tuple[np.ndarray, np.ndarray]:
        """
        Collects the regions where a language is listed and the probability of each,
        based on the total speakers of the language in that region.

        Returns a (regions, probabilities) pair of equally sized arrays.

        Raises:
            UnknownLanguage: no territory lists this language.
            ValueError: territories matched, but none yielded a usable region.
        """
        # Select every territory that has a languagePopulation child for this language.
        territories = self.root.findall(f'.//territory/languagePopulation[@type="{language}"]/..')
        if not territories:
            raise UnknownLanguage(f"No region data found for language: {language}")

        regions = []
        percentages = []
        for terr in territories:
            region = terr.get('type')
            if region is None:
                continue  # Skip if region is not found

            lang_pop = terr.find(f'languagePopulation[@type="{language}"]')
            if lang_pop is None:
                continue  # This shouldn't happen due to our XPath, but just in case

            regions.append(region)
            # Both percentages are on a 0-100 scale, hence the division by 100 * 100.
            # Missing attributes count as 0 (see _as_float), giving the region no weight.
            percentages.append(
                _as_float(lang_pop, 'populationPercent')
                * _as_float(terr, 'literacyPercent')
                / 10_000
                * _as_float(terr, 'population')
            )

        if not regions:
            raise ValueError(f"No valid region data found for language: {language}")

        return self.normalize_probabilities(np.array(regions), np.array(percentages))

    def normalize_probabilities(
        self, languages: np.ndarray, freq: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Normalize probabilities.

        Scales the weights in `freq` so that they sum to 1 and returns them
        together with the untouched `languages` labels.

        If the weights do not add up to a positive number (for example, every
        entry is 0 because the source data is missing), each label is given the
        same probability. Dividing by such a total would otherwise produce NaN
        values, which np.random.choice rejects.
        """
        weights = np.asarray(freq, dtype=float)
        total = np.sum(weights)
        if weights.size and not total > 0:
            return languages, np.full(weights.size, 1.0 / weights.size)
        return languages, weights / total

    def from_region(self, region: str) -> Locale:
        """
        Get a random locale based on the territory ISO code.
        Returns as a Locale object.
        """
        languages, probabilities = self._load_territory_data(region)
        # The data may use underscores inside language codes; locale tags use hyphens.
        language = np.random.choice(languages, p=probabilities).replace('_', '-')
        return normalize_locale(f"{language}-{region}")

    def from_language(self, language: str) -> Locale:
        """
        Get a random locale based on the language.
        Returns as a Locale object.
        """
        regions, probabilities = self._load_language_data(language)
        region = np.random.choice(regions, p=probabilities)
        return normalize_locale(f"{language}-{region}")
if __name__ == "__main__": SELECTOR = StatisticalLocaleSelector()
# Extra tests...
from timeit import timeit
print('LanguageSelector:', timeit(lambda: GetLocaleFromTerritory('ES'), number=100))
ts = GetLocaleFromTerritory('ES')
print('get_random_language:', timeit(lambda: ts.get_random_language(), number=10000))

View file

@ -26,7 +26,7 @@ from .exceptions import (
) )
from .fingerprints import from_browserforge, generate_fingerprint from .fingerprints import from_browserforge, generate_fingerprint
from .ip import Proxy, public_ip, valid_ipv4, valid_ipv6 from .ip import Proxy, public_ip, valid_ipv4, valid_ipv6
from .locale import geoip_allowed, get_geolocation, normalize_locale from .locale import geoip_allowed, get_geolocation, handle_locale
from .pkgman import OS_NAME, get_path, installed_verstr from .pkgman import OS_NAME, get_path, installed_verstr
from .warnings import LeakWarning from .warnings import LeakWarning
from .xpi_dl import add_default_addons from .xpi_dl import add_default_addons
@ -418,7 +418,7 @@ def get_launch_options(
# Set locale # Set locale
if locale: if locale:
parsed_locale = normalize_locale(locale) parsed_locale = handle_locale(locale)
config.update(parsed_locale.as_config()) config.update(parsed_locale.as_config())
# Pass the humanize option # Pass the humanize option

View file

@ -32,6 +32,10 @@ ff_version: >-
If rotating the Firefox version is absolutely necessary, it would be more advisable to If rotating the Firefox version is absolutely necessary, it would be more advisable to
rotate between older versions of Camoufox instead. rotate between older versions of Camoufox instead.
no_region: >-
Because you did not pass in a locale region, Camoufox will generate one for you.
This can cause suspicion if your IP does not match your locale region.
# headless-non-linux: >- # headless-non-linux: >-
# Headless mode is only recommended on Linux at this time. # Headless mode is only recommended on Linux at this time.
# Some WAFs are able to detect headless browsers. The issue is currently being investigated. # Some WAFs are able to detect headless browsers. The issue is currently being investigated.

View file

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry] [tool.poetry]
name = "camoufox" name = "camoufox"
version = "0.2.12" version = "0.2.13"
description = "Wrapper around Playwright to help launch Camoufox" description = "Wrapper around Playwright to help launch Camoufox"
authors = ["daijro <daijro.dev@gmail.com>"] authors = ["daijro <daijro.dev@gmail.com>"]
license = "MIT" license = "MIT"