Module foss42.text.slugify
Expand source code
import re
import unicodedata
import unidecode
from html.entities import name2codepoint
from typing import Optional
from typeguard import typechecked
# Named HTML character references such as "&amp;"; group 1 is the entity name.
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
# Decimal numeric character references such as "&#233;"; group 1 is the number.
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
# Hexadecimal numeric character references such as "&#xE9;"; group 1 is the
# hex digits. HTML allows both "&#x" and "&#X" as the prefix, so accept either.
HEX_PATTERN = re.compile(r'&#[xX]([\da-fA-F]+);')
# One or more consecutive apostrophes.
QUOTE_PATTERN = re.compile(r'[\']+')
# Runs of anything that is not an ASCII letter, an ASCII digit or a dash.
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
# Runs of non-word characters and underscores.
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
# Two or more consecutive dashes.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
# A comma sitting between two digits (thousands separator, as in "1,000").
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
# Separator used internally while building the slug.
DEFAULT_SEPARATOR = '-'
def _decode_numeric_ref(match, base):
    """Return the character for a numeric character reference.

    ``match`` is a match whose group 1 holds the digits and ``base`` is the
    radix to parse them with. A reference that does not map to a valid code
    point is returned unchanged, so one bad reference cannot prevent the
    other references in the same text from being decoded.
    """
    try:
        return chr(int(match.group(1), base))
    except (ValueError, OverflowError):
        # chr() rejects values outside the Unicode range
        return match.group(0)


@typechecked
def slugify(text: str,
            separator: str = DEFAULT_SEPARATOR,
            regex_pattern: Optional[str] = None,
            replacements: Optional[list[tuple[str, str]]] = None) -> str:
    """
    Make a slug from the given text.

    Args:
        text: The text to convert.
        separator: String that replaces the internal '-' separator in the
            final slug.
        regex_pattern: Regular expression whose matches are replaced by the
            separator. When None or empty, DISALLOWED_CHARS_PATTERN is used.
        replacements: Optional (old, new) pairs. They are applied with
            str.replace to the raw text first and once more to the finished
            slug, before the custom separator is substituted.

    Returns:
        The lowercased slug.

    >>> slugify(" aryan #$$ ")
    'aryan'
    >>> slugify("one kožušček")
    'one-kozuscek'
    >>> slugify("one TWO")
    'one-two'
    >>> slugify("Дrаft №2.txt")
    'draft-no-2-txt'
    >>> slugify("Я ♥ борщ")
    'ia-borshch'
    >>> slugify("ÜBER Über ")
    'uber-uber'
    >>> slugify("This is a test ---")
    'this-is-a-test'
    >>> slugify("影師嗎")
    'ying-shi-ma'
    >>> slugify("C'est déjà l'été.")
    'c-est-deja-l-ete'
    >>> slugify("Nín hǎo. Wǒ shì zhōng guó rén")
    'nin-hao-wo-shi-zhong-guo-ren'
    >>> slugify("Компьютер")
    'kompiuter'
    >>> slugify("jaja---lol-méméméoo--a")
    'jaja-lol-mememeoo-a'
    >>> slugify("10 | 20 %")
    '10-20'
    >>> slugify('i love 🦄')
    'i-love'
    """
    # user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # defensive: only reachable when the @typechecked check is not active
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # transliterate to ASCII
    text = unidecode.unidecode(text)

    # named character references
    # NOTE(review): characters produced by the entity decoding below are not
    # transliterated, because unidecode has already run at this point.
    text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

    # decimal character references
    text = DECIMAL_PATTERN.sub(lambda m: _decode_numeric_ref(m, 10), text)

    # hexadecimal character references
    text = HEX_PATTERN.sub(lambda m: _decode_numeric_ref(m, 16), text)

    # translate
    text = unicodedata.normalize('NFKD', text)

    # make the text lowercase
    text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers: drop commas between digits ("1,000" -> "1000")
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # collapse repeated separators and trim them from both ends
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # finalize user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text
Functions
def slugify(text: str, separator: str = '-', regex_pattern: str = None, replacements: Optional[list[tuple[str, str]]] = None) ‑> str
-
Make a slug from the given text.
>>> slugify(" aryan #$$ ") 'aryan'
>>> slugify("one kožušček") 'one-kozuscek'
>>> slugify("one TWO") 'one-two'
>>> slugify("Дrаft №2.txt") 'draft-no-2-txt'
>>> slugify("Я ♥ борщ") 'ia-borshch'
>>> slugify("ÜBER Über ") 'uber-uber'
>>> slugify("This is a test ---") 'this-is-a-test'
>>> slugify("影師嗎") 'ying-shi-ma'
>>> slugify("C'est déjà l'été.") 'c-est-deja-l-ete'
>>> slugify("Nín hǎo. Wǒ shì zhōng guó rén") 'nin-hao-wo-shi-zhong-guo-ren'
>>> slugify("Компьютер") 'kompiuter'
>>> slugify("jaja---lol-méméméoo--a") 'jaja-lol-mememeoo-a'
>>> slugify("10 | 20 %") '10-20'
>>> slugify('i love 🦄') 'i-love'
Expand source code
def _decode_numeric_ref(match, base):
    """Return the character for a numeric character reference.

    ``match`` is a match whose group 1 holds the digits and ``base`` is the
    radix to parse them with. A reference that does not map to a valid code
    point is returned unchanged, so one bad reference cannot prevent the
    other references in the same text from being decoded.
    """
    try:
        return chr(int(match.group(1), base))
    except (ValueError, OverflowError):
        # chr() rejects values outside the Unicode range
        return match.group(0)


@typechecked
def slugify(text: str,
            separator: str = DEFAULT_SEPARATOR,
            regex_pattern: Optional[str] = None,
            replacements: Optional[list[tuple[str, str]]] = None) -> str:
    """
    Make a slug from the given text.

    Args:
        text: The text to convert.
        separator: String that replaces the internal '-' separator in the
            final slug.
        regex_pattern: Regular expression whose matches are replaced by the
            separator. When None or empty, DISALLOWED_CHARS_PATTERN is used.
        replacements: Optional (old, new) pairs. They are applied with
            str.replace to the raw text first and once more to the finished
            slug, before the custom separator is substituted.

    Returns:
        The lowercased slug.

    >>> slugify(" aryan #$$ ")
    'aryan'
    >>> slugify("one kožušček")
    'one-kozuscek'
    >>> slugify("one TWO")
    'one-two'
    >>> slugify("Дrаft №2.txt")
    'draft-no-2-txt'
    >>> slugify("Я ♥ борщ")
    'ia-borshch'
    >>> slugify("ÜBER Über ")
    'uber-uber'
    >>> slugify("This is a test ---")
    'this-is-a-test'
    >>> slugify("影師嗎")
    'ying-shi-ma'
    >>> slugify("C'est déjà l'été.")
    'c-est-deja-l-ete'
    >>> slugify("Nín hǎo. Wǒ shì zhōng guó rén")
    'nin-hao-wo-shi-zhong-guo-ren'
    >>> slugify("Компьютер")
    'kompiuter'
    >>> slugify("jaja---lol-méméméoo--a")
    'jaja-lol-mememeoo-a'
    >>> slugify("10 | 20 %")
    '10-20'
    >>> slugify('i love 🦄')
    'i-love'
    """
    # user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # defensive: only reachable when the @typechecked check is not active
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # transliterate to ASCII
    text = unidecode.unidecode(text)

    # named character references
    # NOTE(review): characters produced by the entity decoding below are not
    # transliterated, because unidecode has already run at this point.
    text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

    # decimal character references
    text = DECIMAL_PATTERN.sub(lambda m: _decode_numeric_ref(m, 10), text)

    # hexadecimal character references
    text = HEX_PATTERN.sub(lambda m: _decode_numeric_ref(m, 16), text)

    # translate
    text = unicodedata.normalize('NFKD', text)

    # make the text lowercase
    text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers: drop commas between digits ("1,000" -> "1000")
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # collapse repeated separators and trim them from both ends
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # finalize user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text