Skip to content

hdx.utilities.text

Text processing utilities.

remove_end_characters

def remove_end_characters(string: str,
                          characters_to_remove: str = punctuation) -> str

[view_source]

Remove any characters at end of string that are in characters_to_remove.

Arguments:

  • string str - Input string
  • characters_to_remove str - Characters to remove. Defaults to punctuation.

Returns:

  • str - String with any characters at end of string that are in characters_to_remove removed

remove_from_end

def remove_from_end(string: str,
                    things_to_remove: List[str],
                    logging_text: Optional[str] = None,
                    whole_words: bool = True) -> str

[view_source]

Remove list of items from end of string, stripping any whitespace.

Arguments:

  • string str - Input string
  • things_to_remove List[str] - Things to remove from the end of string
  • logging_text Optional[str] - Text to log. Defaults to None.
  • whole_words bool - Remove parts of or whole words. Defaults to True (whole words only).

Returns:

  • str - String with text removed

remove_string

def remove_string(string: str,
                  toremove: str,
                  end_characters_to_remove: str = punctuation) -> str

[view_source]

Remove string from another string and delete any preceding end characters (by default punctuation, e.g. a comma) and any whitespace following the punctuation.

Arguments:

  • string str - String to process
  • toremove str - String to remove
  • end_characters_to_remove str - Characters to remove. Defaults to punctuation.

Returns:

  • str - String with other string removed

multiple_replace

def multiple_replace(string: str, replacements: Dict[str, str]) -> str

[view_source]

Simultaneously replace multiple strings in a string.

Arguments:

  • string str - Input string
  • replacements Dict[str,str] - Replacements dictionary

Returns:

  • str - String with replacements

get_words_in_sentence

def get_words_in_sentence(sentence: str) -> List[str]

[view_source]

Returns list of words in a sentence.

Arguments:

  • sentence str - Sentence

Returns:

  • List[str] - List of words in sentence

get_matching_text_in_strs

def get_matching_text_in_strs(a: str,
                              b: str,
                              match_min_size: int = 30,
                              ignore: str = "",
                              end_characters: str = "") -> List[str]

[view_source]

Returns a list of matching blocks of text in a and b.

Arguments:

  • a str - First string to match
  • b str - Second string to match
  • match_min_size int - Minimum block size to match on. Defaults to 30.
  • ignore str - Any characters to ignore in matching. Defaults to ''.
  • end_characters str - End characters to look for. Defaults to ''.

Returns:

  • List[str] - List of matching blocks of text

get_matching_text

def get_matching_text(string_list: List[str],
                      match_min_size: int = 30,
                      ignore: str = "",
                      end_characters: str = ".!\r\n") -> str

[view_source]

Returns a string containing matching blocks of text in a list of strings followed by non-matching.

Arguments:

  • string_list List[str] - List of strings to match
  • match_min_size int - Minimum block size to match on. Defaults to 30.
  • ignore str - Any characters to ignore in matching. Defaults to ''.
  • end_characters str - End characters to look for. Defaults to '.!\r\n'.

Returns:

  • str - String containing matching blocks of text followed by non-matching

get_matching_then_nonmatching_text

def get_matching_then_nonmatching_text(string_list: List[str],
                                       separator: str = "",
                                       match_min_size: int = 30,
                                       ignore: str = "",
                                       end_characters: str = ".!\r\n") -> str

[view_source]

Returns a string containing matching blocks of text in a list of strings followed by non-matching.

Arguments:

  • string_list List[str] - List of strings to match
  • separator str - Separator to add between blocks of text. Defaults to ''.
  • match_min_size int - Minimum block size to match on. Defaults to 30.
  • ignore str - Any characters to ignore in matching. Defaults to ''.
  • end_characters str - End characters to look for. Defaults to '.!\r\n'.

Returns:

  • str - String containing matching blocks of text followed by non-matching

number_format

def number_format(val: Any,
                  format: str = "%.4f",
                  trailing_zeros: bool = True) -> str

[view_source]

Format float-castable input as string.

Arguments:

  • val Any - Float-castable number to format
  • format str - Format to use. Defaults to %.4f.
  • trailing_zeros bool - Leave trailing zeros. Defaults to True.

Returns:

  • str - Formatted number as string

get_fraction_str

def get_fraction_str(numerator: Any,
                     denominator: Optional[Any] = None,
                     format: str = "%.4f",
                     trailing_zeros: bool = True) -> str

[view_source]

Given float-castable numerator and optional float-castable denominator, format as string, returning '' for invalid numerator or 0 denominator.

Arguments:

  • numerator Any - Float-castable numerator
  • denominator Optional[Any] - Float-castable denominator. Defaults to None.
  • format str - Format to use. Defaults to %.4f.
  • trailing_zeros bool - Leave trailing zeros. Defaults to True.

Returns:

  • str - Formatted number as string

only_allowed_in_str

def only_allowed_in_str(test_str: str, allowed_chars: Set) -> bool

[view_source]

Returns True if test string contains only allowed characters, False if not.

Arguments:

  • test_str str - Test string
  • allowed_chars Set - Set of allowed characters

Returns:

  • bool - True if test string contains only allowed characters, False if not

get_numeric_if_possible

def get_numeric_if_possible(value: Any) -> Any

[view_source]

Return value if it is not a string, otherwise see if it can be cast to float or int, taking into account commas and periods.

Arguments:

  • value Any - Value

Returns:

  • Any - Value

earliest_index

def earliest_index(string_to_search: str,
                   strings_to_try: ListTuple[str]) -> Optional[int]

[view_source]

Search a string for each of a list of strings and return the earliest index.

Arguments:

  • string_to_search str - String to search
  • strings_to_try ListTuple[str] - Strings to try

Returns:

  • Optional[int] - Earliest index of the strings to try in string to search or None

match_template_variables

def match_template_variables(
        string: str) -> Tuple[Optional[str], Optional[str]]

[view_source]

Try to match {{XXX}} in input string.

Arguments:

  • string str - String in which to look for template

Returns:

  • Tuple[Optional[str], Optional[str]] - (Matched string with brackets, matched string without brackets)