"""Shared text processing utilities.
Consolidates text normalization, cleaning, and matching functions used across
the codebase for author names, abstracts, and other text fields.
"""
import re
import html as htmllib
import unicodedata
from typing import Optional, List, Tuple
def strip_jats(text: Optional[str]) -> Optional[str]:
    """Remove JATS/HTML tags and unescape entities in Crossref-style strings.

    JATS (Journal Article Tag Suite) is an XML format used by publishers.
    Crossref and other APIs often return abstracts with JATS tags embedded.

    Args:
        text: Text potentially containing JATS/HTML tags.

    Returns:
        Cleaned text with tags removed and entities unescaped, or the input
        unchanged (None or "") if it was falsy.

    Examples:
        >>> strip_jats("<jats:p>Some text</jats:p>")
        'Some text'
        >>> strip_jats("Text with &lt;angle&gt; brackets")
        'Text with <angle> brackets'
    """
    if not text:
        # Preserve None/"" as-is so callers can tell "missing" from "cleaned empty".
        return text
    # Remove <jats:...> wrappers first, then any remaining HTML-style tags.
    text = re.sub(r"</?jats:[^>]+>", "", text, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "", text)
    # Unescape HTML entities like &lt; &gt; &amp; and trim surrounding whitespace.
    return htmllib.unescape(text).strip()
def clean_abstract_for_db(text: Optional[str]) -> Optional[str]:
    """Conservative sanitizer for abstracts before storing in database.

    Performs comprehensive cleaning:
    - Removes JATS/HTML tags and unescapes entities via strip_jats()
    - Strips stray '<' and '>' characters (common artifact from feeds)
    - Removes leading feed prefixes like "Abstract" and arXiv announce headers
    - Normalizes whitespace and removes zero-width characters

    Args:
        text: Raw abstract text from API or feed.

    Returns:
        Cleaned abstract ready for database storage, or None if input was None.

    Examples:
        >>> clean_abstract_for_db("Abstract: This is the abstract.")
        'This is the abstract.'
        >>> clean_abstract_for_db("arXiv:2509.09390v1 Announce Type: new Abstract: Text")
        'Text'
    """
    if text is None:
        return None
    # First remove tags and unescape entities.
    s = strip_jats(text) or ""
    # Remove zero-width and BOM-like characters that break search and display.
    s = s.replace("\u200B", "").replace("\u200C", "").replace("\u200D", "").replace("\uFEFF", "")
    # Normalize non-breaking spaces to ordinary spaces.
    s = s.replace("\xa0", " ")
    # Remove any remaining angle-bracket characters, which often leak from markup.
    s = s.replace("<", "").replace(">", "")
    # Drop a leading arXiv announce header like:
    #   "arXiv:2509.09390v1 Announce Type: new Abstract: ..."
    s = re.sub(r"^\s*arXiv:[^\n]*?(?:Announce\s+Type:\s*\w+\s+)?Abstract:\s*", "", s, flags=re.IGNORECASE)
    # Drop a simple leading "Abstract" / "Abstract:" token. The \b word boundary
    # prevents mangling abstracts that merely START with a longer word, e.g.
    # "Abstraction ..." (the unanchored pattern stripped its first 8 letters).
    s = re.sub(r"^\s*Abstract\b\s*:?[\s\-–—]*", "", s, flags=re.IGNORECASE)
    # Collapse runs of horizontal whitespace and excessive blank lines.
    s = re.sub(r"[\t\r ]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()
def strip_accents(text: str) -> str:
    """Return ASCII-ish text by removing accent marks via Unicode normalization.

    Decomposes the input with NFKD so each accented character becomes a base
    character followed by combining marks, then drops the combining marks.
    Useful for comparing author names and other text where accents should not
    affect matching.

    Args:
        text: Text potentially containing accented characters.

    Returns:
        Text with accent marks removed.

    Examples:
        >>> strip_accents("José García")
        'Jose Garcia'
        >>> strip_accents("Müller")
        'Muller'
    """
    kept = []
    for ch in unicodedata.normalize("NFKD", text):
        # Combining characters are the decomposed accent marks; skip them.
        if not unicodedata.combining(ch):
            kept.append(ch)
    return "".join(kept)
def normalize_name(text: str) -> str:
    """Normalize a human name for loose matching.

    Strips accents and punctuation and converts to lowercase for fuzzy name
    matching. Hyphens are treated as separators so hyphenated and spaced
    surname forms normalize identically.

    Args:
        text: Human name to normalize (None is tolerated and treated as "").

    Returns:
        Normalized name suitable for comparison.

    Examples:
        >>> normalize_name("García-López, José")
        'garcia lopez jose'
        >>> normalize_name("John P. Smith")
        'john p smith'
    """
    t = strip_accents(text or "").lower()
    # Replace every non-letter (punctuation, digits, hyphens) with a space so
    # "garcia-lopez" and "garcia lopez" normalize to the same string, matching
    # the documented examples here and in parse_name_parts().
    t = re.sub(r"[^a-z\s]", " ", t)
    # Collapse runs of whitespace into single spaces.
    t = re.sub(r"\s+", " ", t).strip()
    return t
def parse_name_parts(name: str) -> Tuple[str, List[str]]:
    """Parse a human name into (lastname, initials[]).

    Handles both "Last, First M" and "First M Last" styles, ignoring accents
    and case for robust parsing.

    Args:
        name: Full name in various formats.

    Returns:
        Tuple of (lastname, list of first/middle initials).

    Examples:
        >>> parse_name_parts("Smith, John P.")
        ('smith', ['j', 'p'])
        >>> parse_name_parts("John P. Smith")
        ('smith', ['j', 'p'])
        >>> parse_name_parts("García-López, José")
        ('garcia lopez', ['j'])
    """
    if not name:
        return "", []
    if "," in name:
        # "Last, First M" style: the comma (checked before normalization,
        # which strips it) marks everything to its left as the surname.
        before, _sep, after = name.partition(",")
        surname = normalize_name(before)
        given_tokens = normalize_name(after).split()
    else:
        # "First M Last" style: the final token is taken as the surname.
        words = normalize_name(name).split()
        surname = words[-1] if words else ""
        given_tokens = words[:-1]
    # First letter of each remaining given/middle token becomes an initial.
    return surname, [tok[0] for tok in given_tokens if tok]
def names_match(a: str, b: str) -> bool:
    """Heuristic author-name comparator supporting initials and comma forms.

    Compares two author names with fuzzy matching that handles:
    - Different name orderings (Last, First vs First Last)
    - Initials vs full first names
    - Accents and punctuation differences

    Args:
        a: First author name.
        b: Second author name.

    Returns:
        True if names likely refer to the same person, False otherwise.

    Examples:
        >>> names_match("Smith, J. P.", "John P. Smith")
        True
        >>> names_match("J. Smith", "Jane Smith")
        True
        >>> names_match("J. Smith", "John Doe")
        False
    """
    last_a, initials_a = parse_name_parts(a)
    last_b, initials_b = parse_name_parts(b)
    # Both sides must yield a surname, and the surnames must agree exactly.
    if not (last_a and last_b) or last_a != last_b:
        return False
    # When both sides carry initials, demand at least one in common; if either
    # side has none, the surname match alone is accepted.
    if initials_a and initials_b:
        return bool(set(initials_a) & set(initials_b))
    return True