# Source code for paper_firehose.core.text_utils

"""Shared text processing utilities.

Consolidates text normalization, cleaning, and matching functions used across
the codebase for author names, abstracts, and other text fields.
"""

import re
import html as htmllib
import unicodedata
from typing import Optional, List, Tuple


def strip_jats(text: Optional[str]) -> Optional[str]:
    """Strip JATS/HTML markup from a string and decode its HTML entities.

    JATS (Journal Article Tag Suite) is an XML vocabulary used by publishers;
    Crossref-style abstracts frequently arrive with ``<jats:...>`` tags still
    embedded in the text.

    Args:
        text: Text that may contain JATS/HTML tags and HTML entities.

    Returns:
        The text with tags deleted, entities unescaped and surrounding
        whitespace trimmed. Falsy input (``None`` or ``""``) is returned
        unchanged.

    Examples:
        >>> strip_jats("<jats:p>Some text</jats:p>")
        'Some text'
        >>> strip_jats("Text with &lt;angle&gt; brackets")
        'Text with <angle> brackets'
    """
    if not text:
        return text

    # Two ordered passes: namespaced JATS tags first (any letter case), then
    # whatever other tags are left. Tags are deleted outright, not replaced
    # by a space.
    tag_passes = (
        (r"</?jats:[^>]+>", re.IGNORECASE),
        (r"<[^>]+>", 0),
    )
    stripped = text
    for pattern, flags in tag_passes:
        stripped = re.sub(pattern, "", stripped, flags=flags)

    # Entities are decoded only after tag removal, so an escaped "&lt;x&gt;"
    # survives as the literal text "<x>".
    decoded = htmllib.unescape(stripped)
    return decoded.strip()
# Single-pass character clean-up used by clean_abstract_for_db().
_ABSTRACT_CHAR_TABLE = str.maketrans(
    {
        "\u200B": None,  # zero-width space
        "\u200C": None,  # zero-width non-joiner
        "\u200D": None,  # zero-width joiner
        "\uFEFF": None,  # BOM / zero-width no-break space
        "\xa0": " ",  # non-breaking space -> regular space
        "<": None,  # stray angle brackets leaking from markup
        ">": None,
    }
)


def clean_abstract_for_db(text: Optional[str]) -> Optional[str]:
    """Conservative sanitizer for abstracts before storing in database.

    Cleaning steps, in order:

    - Remove JATS/HTML tags and unescape entities via ``strip_jats()``.
    - Delete zero-width/BOM characters, turn non-breaking spaces into regular
      spaces and drop any remaining ``<`` / ``>`` characters.
    - Remove a leading arXiv announce header, then a leading "Abstract" token.
    - Collapse runs of spaces/tabs/carriage returns to one space and runs of
      three or more newlines to two.

    Args:
        text: Raw abstract text from API or feed.

    Returns:
        Cleaned abstract ready for database storage, or ``None`` if the input
        was ``None``.

    Examples:
        >>> clean_abstract_for_db("Abstract: This is the abstract.")
        'This is the abstract.'
        >>> clean_abstract_for_db("arXiv:2509.09390v1 Announce Type: new Abstract: Text")
        'Text'
        >>> clean_abstract_for_db("arXiv:2509.09390v1 Announce Type: replace-cross \\nAbstract: Text")
        'Text'
    """
    if text is None:
        return None

    # Tags and entities first; strip_jats() may hand back a falsy value.
    s = strip_jats(text) or ""

    # One translate() pass replaces the former chain of single-character
    # replace() calls; the mappings are independent, so the result is the same.
    s = s.translate(_ABSTRACT_CHAR_TABLE)

    # Drop a leading arXiv announce header such as
    #   "arXiv:2509.09390v1 Announce Type: new Abstract: ..."
    # The announce type may contain a hyphen ("replace-cross"), and the
    # optional group is the only part of the pattern that can cross a line
    # break before "Abstract:", so it must accept hyphens too.
    s = re.sub(
        r"^\s*arXiv:[^\n]*?(?:Announce\s+Type:\s*[\w-]+\s+)?Abstract:\s*",
        "",
        s,
        flags=re.IGNORECASE,
    )

    # Drop a simple leading "Abstract" / "Abstract:" token.
    # NOTE(review): there is deliberately no word boundary here. strip_jats()
    # deletes tags without inserting a space, so a title element glued to the
    # first paragraph ("AbstractText") is still cleaned. The flip side is that
    # an abstract starting with a word like "Abstractive" loses its first
    # eight letters.
    s = re.sub(r"^\s*Abstract\s*:?[\s\-–—]*", "", s, flags=re.IGNORECASE)

    # Collapse excessive whitespace, preserving at most one blank line.
    s = re.sub(r"[\t\r ]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()
def strip_accents(text: str) -> str:
    """Drop accent marks from *text* using Unicode decomposition.

    The text is decomposed with NFKD and every character that has a non-zero
    combining class is discarded. Handy when accents should not influence a
    comparison, e.g. for author names. Characters that do not decompose into
    a base letter plus combining mark are left as they are.

    Args:
        text: Text potentially containing accented characters.

    Returns:
        The decomposed text with combining marks removed.

    Examples:
        >>> strip_accents("José García")
        'Jose Garcia'
        >>> strip_accents("Müller")
        'Muller'
    """
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def normalize_name(text: str) -> str:
    """Normalize a human name for loose matching.

    Strips accents, lowercases, replaces punctuation and digits with spaces,
    and collapses whitespace. Hyphens are kept so that double-barrelled
    surnames stay a single token.

    Args:
        text: Human name to normalize. ``None`` or ``""`` yields ``""``.

    Returns:
        Normalized name suitable for comparison.

    Examples:
        >>> normalize_name("García-López, José")
        'garcia-lopez jose'
        >>> normalize_name("John P. Smith")
        'john p smith'
        >>> normalize_name("Søren Møller")
        'søren møller'
    """
    t = strip_accents(text or "").lower()
    # Keep letters of any script, whitespace and hyphens; everything else
    # (punctuation, digits, underscores) becomes a space. An ASCII-only
    # [a-z] filter would also blank out letters that have no accent
    # decomposition (e.g. "ø", "ł", "ß") and whole non-Latin names, splitting
    # "møller" into "m ller".
    t = re.sub(r"[^\w\s\-]|[\d_]", " ", t)
    # Collapse multiple spaces
    return re.sub(r"\s+", " ", t).strip()
def parse_name_parts(name: str) -> Tuple[str, List[str]]:
    """Parse a human name into (lastname, initials[]).

    Handles both "Last, First M" and "First M Last" styles. Each part is
    passed through ``normalize_name()``, so accents, case and punctuation are
    ignored while hyphens inside a surname are kept.

    Args:
        name: Full name in either supported format.

    Returns:
        Tuple of (lastname, list of first/middle initials). An empty name
        yields ``("", [])``.

    Examples:
        >>> parse_name_parts("Smith, John P.")
        ('smith', ['j', 'p'])
        >>> parse_name_parts("John P. Smith")
        ('smith', ['j', 'p'])
        >>> parse_name_parts("García-López, José")
        ('garcia-lopez', ['j'])
        >>> parse_name_parts("Sartre, J.-P.")
        ('sartre', ['j', 'p'])
    """
    if not name:
        return "", []

    # The comma is checked on the raw string because normalization removes it.
    if "," in name:
        # "Last, First M": everything before the first comma is the surname.
        last_raw, _, rest_raw = name.partition(",")
        last = normalize_name(last_raw)
        tokens = normalize_name(rest_raw).split()
    else:
        # "First M Last": the final token is taken as the surname.
        tokens = normalize_name(name).split()
        last = tokens[-1] if tokens else ""
        tokens = tokens[:-1]

    # First letter of each remaining token. normalize_name() keeps hyphens,
    # so "J.-P." arrives as the tokens "j" and "-p"; strip leading hyphens so
    # the initial is "p" rather than "-", and skip hyphen-only tokens.
    initials = []
    for token in tokens:
        core = token.lstrip("-")
        if core:
            initials.append(core[0])
    return last, initials
def names_match(a: str, b: str) -> bool:
    """Decide heuristically whether two author names denote the same person.

    Both names are parsed with ``parse_name_parts()``, so the comparison
    tolerates:

    - "Last, First" versus "First Last" ordering
    - initials versus spelled-out given names
    - differences in accents and punctuation

    Args:
        a: First author name.
        b: Second author name.

    Returns:
        True if the names likely refer to the same person, False otherwise.

    Examples:
        >>> names_match("Smith, J. P.", "John P. Smith")
        True
        >>> names_match("J. Smith", "Jane Smith")
        True
        >>> names_match("J. Smith", "John Doe")
        False
    """
    surname_a, initials_a = parse_name_parts(a)
    surname_b, initials_b = parse_name_parts(b)

    # A missing surname on either side, or differing surnames, rules out a match.
    if not (surname_a and surname_b) or surname_a != surname_b:
        return False

    # Initials are only compared when both names supply some; in that case
    # sharing a single initial is enough.
    if initials_a and initials_b:
        return not set(initials_a).isdisjoint(initials_b)
    return True