mirror of
https://forge.fsky.io/oneflux/omegafox.git
synced 2026-02-10 10:12:06 -08:00
Allow passing in only a region or only a language as the locale (0.2.13)
- Passing in a region, e.g. `locale='US'`, will generate a language based on that region.
- Passing in a language, e.g. `locale='en'`, will generate a region based on that language.
- Fix locale parsing errors when generating locales from territoryInfo.xml (#36).
This commit is contained in:
parent
5bb274ddc8
commit
deddba00c3
5 changed files with 142 additions and 50 deletions
|
|
@ -86,7 +86,20 @@ class UnknownIPLocation(LocaleError):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
class UnknownTerritory(LocaleError):
|
class InvalidLocale(LocaleError):
    """
    Error for a locale input that is not valid.
    """

    @classmethod
    def invalid_input(cls, locale: str) -> 'InvalidLocale':
        """
        Build the exception for a rejected `locale` string.

        The message echoes the offending value and lists the accepted forms.
        """
        message = (
            f"Invalid locale: '{locale}'. Must be either a region, language, "
            "language-region, or language-script-region."
        )
        return cls(message)
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownTerritory(InvalidLocale):
|
||||||
"""
|
"""
|
||||||
Raised when the territory is unknown.
|
Raised when the territory is unknown.
|
||||||
"""
|
"""
|
||||||
|
|
@ -94,6 +107,14 @@ class UnknownTerritory(LocaleError):
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownLanguage(InvalidLocale):
    """
    Error for a language that is not known.
    """
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
class NotInstalledGeoIPExtra(ImportError):
|
class NotInstalledGeoIPExtra(ImportError):
|
||||||
"""
|
"""
|
||||||
Raised when the geoip2 module is not installed.
|
Raised when the geoip2 module is not installed.
|
||||||
|
|
|
||||||
|
|
@ -7,8 +7,15 @@ import numpy as np
|
||||||
from language_tags import tags
|
from language_tags import tags
|
||||||
|
|
||||||
from camoufox.pkgman import LOCAL_DATA, rprint, webdl
|
from camoufox.pkgman import LOCAL_DATA, rprint, webdl
|
||||||
|
from camoufox.warnings import LeakWarning
|
||||||
|
|
||||||
from .exceptions import NotInstalledGeoIPExtra, UnknownIPLocation, UnknownTerritory
|
from .exceptions import (
|
||||||
|
InvalidLocale,
|
||||||
|
NotInstalledGeoIPExtra,
|
||||||
|
UnknownIPLocation,
|
||||||
|
UnknownLanguage,
|
||||||
|
UnknownTerritory,
|
||||||
|
)
|
||||||
from .ip import validate_ip
|
from .ip import validate_ip
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -90,9 +97,7 @@ def verify_locales(locales: List[str]) -> None:
|
||||||
for loc in locales:
|
for loc in locales:
|
||||||
if tags.check(loc):
|
if tags.check(loc):
|
||||||
continue
|
continue
|
||||||
raise ValueError(
|
raise InvalidLocale.invalid_input(loc)
|
||||||
f"Invalid locale: '{loc}'. All locales must be in the format of language[-script][-region]"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_locale(locale: str) -> Locale:
|
def normalize_locale(locale: str) -> Locale:
|
||||||
|
|
@ -107,7 +112,7 @@ def normalize_locale(locale: str) -> Locale:
|
||||||
# Parse the locale
|
# Parse the locale
|
||||||
parser = tags.tag(locale)
|
parser = tags.tag(locale)
|
||||||
if not parser.region:
|
if not parser.region:
|
||||||
raise ValueError(f"Invalid locale: {locale}. Region is required.")
|
raise InvalidLocale.invalid_input(locale)
|
||||||
|
|
||||||
record = parser.language.data['record']
|
record = parser.language.data['record']
|
||||||
|
|
||||||
|
|
@ -119,6 +124,29 @@ def normalize_locale(locale: str) -> Locale:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def handle_locale(locale: str) -> Locale:
    """
    Turn a user-supplied locale string into a Locale.

    Inputs longer than three characters are handed straight to
    `normalize_locale`. Shorter inputs are tried as a region first and as a
    language second, with the missing half generated by `SELECTOR`.

    Raises InvalidLocale when the short input is neither a known territory
    nor a known language.
    """
    if len(locale) <= 3:
        # Region lookup takes precedence over language lookup.
        try:
            return SELECTOR.from_region(locale)
        except UnknownTerritory:
            pass

        generated = None
        try:
            generated = SELECTOR.from_language(locale)
        except UnknownLanguage:
            pass

        if generated is None:
            # Raised outside the except block so no exception context is attached.
            raise InvalidLocale.invalid_input(locale)

        # The region was generated, not supplied by the caller; warn about it.
        LeakWarning.warn('no_region')
        return generated

    return normalize_locale(locale)
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Helpers to fetch geolocation, timezone, and locale data given an IP.
|
Helpers to fetch geolocation, timezone, and locale data given an IP.
|
||||||
"""
|
"""
|
||||||
|
|
@ -176,7 +204,7 @@ def get_geolocation(ip: str) -> Geolocation:
|
||||||
|
|
||||||
with geoip2.database.Reader(str(MMDB_FILE)) as reader:
|
with geoip2.database.Reader(str(MMDB_FILE)) as reader:
|
||||||
resp = reader.city(ip)
|
resp = reader.city(ip)
|
||||||
iso_code = cast(str, resp.registered_country.iso_code)
|
iso_code = cast(str, resp.registered_country.iso_code).upper()
|
||||||
location = resp.location
|
location = resp.location
|
||||||
|
|
||||||
# Check if any required attributes are missing
|
# Check if any required attributes are missing
|
||||||
|
|
@ -184,8 +212,7 @@ def get_geolocation(ip: str) -> Geolocation:
|
||||||
raise UnknownIPLocation(f"Unknown IP location: {ip}")
|
raise UnknownIPLocation(f"Unknown IP location: {ip}")
|
||||||
|
|
||||||
# Get a statistically correct locale based on the country code
|
# Get a statistically correct locale based on the country code
|
||||||
locale_finder = GetLocaleFromTerritory(iso_code)
|
locale = SELECTOR.from_region(iso_code)
|
||||||
locale = locale_finder.get_locale()
|
|
||||||
|
|
||||||
return Geolocation(
|
return Geolocation(
|
||||||
locale=locale,
|
locale=locale,
|
||||||
|
|
@ -211,60 +238,100 @@ def get_unicode_info() -> ET.Element:
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
class GetLocaleFromTerritory:
|
def _as_float(element: ET.Element, attr: str) -> float:
|
||||||
|
"""
|
||||||
|
Converts an attribute to a float.
|
||||||
|
"""
|
||||||
|
return float(element.get(attr, 0))
|
||||||
|
|
||||||
|
|
||||||
|
class StatisticalLocaleSelector:
|
||||||
|
"""
|
||||||
|
Selects a random locale based on statistical data.
|
||||||
|
Takes either a territory code or a language code, and generates a Locale object.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.root = get_unicode_info()
|
||||||
|
|
||||||
|
def _load_territory_data(self, iso_code: str) -> Tuple[np.ndarray, np.ndarray]:
|
||||||
"""
|
"""
|
||||||
Calculates a random language based on the territory code,
|
Calculates a random language based on the territory code,
|
||||||
based on the probability that a person speaks the language in the territory.
|
based on the probability that a person speaks the language in the territory.
|
||||||
"""
|
"""
|
||||||
|
territory = self.root.find(f"territory[@type='{iso_code}']")
|
||||||
def __init__(self, iso_code: str):
|
|
||||||
self.iso_code = iso_code.upper()
|
|
||||||
self.root = get_unicode_info()
|
|
||||||
self.languages, self.probabilities = self._load_territory_data()
|
|
||||||
|
|
||||||
def _load_territory_data(self) -> Tuple[np.ndarray, np.ndarray]:
|
|
||||||
territory = self.root.find(f"territory[@type='{self.iso_code}']")
|
|
||||||
|
|
||||||
if territory is None:
|
if territory is None:
|
||||||
raise UnknownTerritory(f"Unknown territory: {self.iso_code}")
|
raise UnknownTerritory(f"Unknown territory: {iso_code}")
|
||||||
|
|
||||||
lang_population = territory.findall('languagePopulation')
|
lang_populations = territory.findall('languagePopulation')
|
||||||
|
if not lang_populations:
|
||||||
|
raise ValueError(f"No language data found for region: {iso_code}")
|
||||||
|
|
||||||
if not lang_population:
|
languages = np.array([lang.get('type') for lang in lang_populations])
|
||||||
raise ValueError(f"No language data found for territory: {self.iso_code}")
|
percentages = np.array([_as_float(lang, 'populationPercent') for lang in lang_populations])
|
||||||
|
|
||||||
# Use list comprehension for faster data extraction
|
return self.normalize_probabilities(languages, percentages)
|
||||||
languages = np.array([lang.get('type') for lang in lang_population])
|
|
||||||
percentages = np.array(
|
def _load_language_data(self, language: str) -> Tuple[np.ndarray, np.ndarray]:
|
||||||
[float(lang.get('populationPercent', '0')) for lang in lang_population]
|
"""
|
||||||
|
Calculates a random region for a language
|
||||||
|
based on the total speakers of the language in that region.
|
||||||
|
"""
|
||||||
|
territories = self.root.findall(f'.//territory/languagePopulation[@type="{language}"]/..')
|
||||||
|
if not territories:
|
||||||
|
raise UnknownLanguage(f"No region data found for language: {language}")
|
||||||
|
|
||||||
|
regions = []
|
||||||
|
percentages = []
|
||||||
|
|
||||||
|
for terr in territories:
|
||||||
|
region = terr.get('type')
|
||||||
|
if region is None:
|
||||||
|
continue # Skip if region is not found
|
||||||
|
|
||||||
|
lang_pop = terr.find(f'languagePopulation[@type="{language}"]')
|
||||||
|
if lang_pop is None:
|
||||||
|
continue # This shouldn't happen due to our XPath, but just in case
|
||||||
|
|
||||||
|
regions.append(region)
|
||||||
|
percentages.append(
|
||||||
|
_as_float(lang_pop, 'populationPercent')
|
||||||
|
* _as_float(terr, 'literacyPercent')
|
||||||
|
/ 10_000
|
||||||
|
* _as_float(terr, 'population')
|
||||||
)
|
)
|
||||||
|
|
||||||
# Normalize probabilities
|
if not regions:
|
||||||
total = np.sum(percentages)
|
raise ValueError(f"No valid region data found for language: {language}")
|
||||||
probabilities = percentages / total
|
|
||||||
|
|
||||||
return languages, probabilities
|
return self.normalize_probabilities(np.array(regions), np.array(percentages))
|
||||||
|
|
||||||
def get_random_language(self) -> str:
|
def normalize_probabilities(
|
||||||
|
self, languages: np.ndarray, freq: np.ndarray
|
||||||
|
) -> Tuple[np.ndarray, np.ndarray]:
|
||||||
"""
|
"""
|
||||||
Get a random language based on the territory ISO code.
|
Normalize probabilities.
|
||||||
"""
|
"""
|
||||||
return np.random.choice(self.languages, p=self.probabilities)
|
total = np.sum(freq)
|
||||||
|
return languages, freq / total
|
||||||
|
|
||||||
def get_locale(self) -> Locale:
|
def from_region(self, region: str) -> Locale:
|
||||||
"""
|
"""
|
||||||
Get a random locale based on the territory ISO code.
|
Get a random locale based on the territory ISO code.
|
||||||
Returns as a Locale object.
|
Returns as a Locale object.
|
||||||
"""
|
"""
|
||||||
language = self.get_random_language()
|
languages, probabilities = self._load_territory_data(region)
|
||||||
return normalize_locale(f"{language}-{self.iso_code}")
|
language = np.random.choice(languages, p=probabilities).replace('_', '-')
|
||||||
|
return normalize_locale(f"{language}-{region}")
|
||||||
|
|
||||||
|
def from_language(self, language: str) -> Locale:
|
||||||
|
"""
|
||||||
|
Get a random locale based on the language.
|
||||||
|
Returns as a Locale object.
|
||||||
|
"""
|
||||||
|
regions, probabilities = self._load_language_data(language)
|
||||||
|
region = np.random.choice(regions, p=probabilities)
|
||||||
|
return normalize_locale(f"{language}-{region}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
SELECTOR = StatisticalLocaleSelector()
|
||||||
# Extra tests...
|
|
||||||
from timeit import timeit
|
|
||||||
|
|
||||||
print('LanguageSelector:', timeit(lambda: GetLocaleFromTerritory('ES'), number=100))
|
|
||||||
|
|
||||||
ts = GetLocaleFromTerritory('ES')
|
|
||||||
print('get_random_language:', timeit(lambda: ts.get_random_language(), number=10000))
|
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ from .exceptions import (
|
||||||
)
|
)
|
||||||
from .fingerprints import from_browserforge, generate_fingerprint
|
from .fingerprints import from_browserforge, generate_fingerprint
|
||||||
from .ip import Proxy, public_ip, valid_ipv4, valid_ipv6
|
from .ip import Proxy, public_ip, valid_ipv4, valid_ipv6
|
||||||
from .locale import geoip_allowed, get_geolocation, normalize_locale
|
from .locale import geoip_allowed, get_geolocation, handle_locale
|
||||||
from .pkgman import OS_NAME, get_path, installed_verstr
|
from .pkgman import OS_NAME, get_path, installed_verstr
|
||||||
from .warnings import LeakWarning
|
from .warnings import LeakWarning
|
||||||
from .xpi_dl import add_default_addons
|
from .xpi_dl import add_default_addons
|
||||||
|
|
@ -418,7 +418,7 @@ def get_launch_options(
|
||||||
|
|
||||||
# Set locale
|
# Set locale
|
||||||
if locale:
|
if locale:
|
||||||
parsed_locale = normalize_locale(locale)
|
parsed_locale = handle_locale(locale)
|
||||||
config.update(parsed_locale.as_config())
|
config.update(parsed_locale.as_config())
|
||||||
|
|
||||||
# Pass the humanize option
|
# Pass the humanize option
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,10 @@ ff_version: >-
|
||||||
If rotating the Firefox version is absolutely necessary, it would be more advisable to
|
If rotating the Firefox version is absolutely necessary, it would be more advisable to
|
||||||
rotate between older versions of Camoufox instead.
|
rotate between older versions of Camoufox instead.
|
||||||
|
|
||||||
|
no_region: >-
|
||||||
|
Because you did not pass in a locale region, Camoufox will generate one for you.
|
||||||
|
This can cause suspicion if your IP does not match your locale region.
|
||||||
|
|
||||||
# headless-non-linux: >-
|
# headless-non-linux: >-
|
||||||
# Headless mode is only recommended on Linux at this time.
|
# Headless mode is only recommended on Linux at this time.
|
||||||
# Some WAFs are able to detect headless browsers. The issue is currently being investigated.
|
# Some WAFs are able to detect headless browsers. The issue is currently being investigated.
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||||
|
|
||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "camoufox"
|
name = "camoufox"
|
||||||
version = "0.2.12"
|
version = "0.2.13"
|
||||||
description = "Wrapper around Playwright to help launch Camoufox"
|
description = "Wrapper around Playwright to help launch Camoufox"
|
||||||
authors = ["daijro <daijro.dev@gmail.com>"]
|
authors = ["daijro <daijro.dev@gmail.com>"]
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue