Title: | FlashText Algorithm for Finding and Replacing Words |
---|---|
Description: | Implementation of the FlashText algorithm, by Singh (2017) <arXiv:1711.00046>. It can be used to find and replace words in a given text with only one pass over the document. |
Authors: | Abraham Jaimes [aut, cre] |
Maintainer: | Abraham Jaimes <[email protected]> |
License: | MIT + file LICENSE |
Version: | 1.0.0.9000 |
Built: | 2025-01-24 03:39:02 UTC |
Source: | https://github.com/abrja/rflashtext |
Based on the Python library flashtext. To see more details about the algorithm, visit: FlashText
new()
keyword_processor$new( ignore_case = TRUE, word_chars = c(letters, LETTERS, 0:9, "_"), dict = NULL )
ignore_case
logical. If FALSE
the search is case sensitive. Default TRUE
.
word_chars
character vector. Used to validate if a word continues. Default c(letters, LETTERS, 0:9, "_")
equivalent to [a-zA-Z0-9_]
.
dict
list. Internally built character by character and needed for the search. It is recommended to keep the default value NULL
.
invisible. Assign to a variable to inspect the output. Logical. TRUE
if all went well.
library(rflashtext) processor <- keyword_processor$new(ignore_case = FALSE, word_chars = letters) processor
show_attrs()
keyword_processor$show_attrs(attrs = "all")
attrs
character vector. Options are subsets of c("all", "id", "word_chars", "dict", "ignore_case", "dict_size")
. Default "all"
.
list with the values of the attrs
. Useful to save dict
and reuse it or to check the dict_size
.
library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$show_attrs(attrs = "dict_size") processor$show_attrs(attrs = "dict")
add_keys_words()
keyword_processor$add_keys_words(keys, words = NULL)
keys
character vector. Strings to identify (find/replace) in the text.
words
character vector. Strings to be returned (find) or used as replacements (replace) when finding the respective keys
. Should have the same length as keys
. If not provided, words = keys
.
invisible. Assign to a variable to inspect the output. Logical vector. FALSE
if keys
are duplicated; in that case the respective words
will be updated.
library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) correct <- processor$add_keys_words(keys = c("NY", "CA"), words = c("New York City", "California")) # To check if there are duplicate keys correct
contain_keys()
keyword_processor$contain_keys(keys)
keys
character vector. Strings to check whether they are already in the search dictionary.
logical vector. TRUE
if the keys
are in the search dictionary.
library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$contain_keys(keys = c("NY", "LA", "TX"))
get_words()
keyword_processor$get_words(keys)
keys
character vector. Strings to get back the respective words
.
character vector. Respective words
. If keys
are not found, returns NA_character_
.
library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$get_words(keys = c("NY", "LA", "TX"))
find_keys()
keyword_processor$find_keys(sentence, span_info = TRUE)
sentence
character. Text in which to find the keys
previously defined. Not vectorized.
span_info
logical. TRUE
to retrieve the words
and the position of the matches. FALSE
to only retrieve the words
. Default TRUE
.
list with the words
corresponding to keys
found in the sentence
. Hint: Use do.call(rbind, ...)
to transform the list to a matrix.
library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) words_found <- processor$find_keys(sentence = "I live in LA but I like NY") do.call(rbind, words_found)
replace_keys()
keyword_processor$replace_keys(sentence)
sentence
character. Text to replace the keys
found by the corresponding words
. Not vectorized.
character. Text with the keys
replaced by the respective words
.
library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) new_sentence <- processor$replace_keys(sentence = "I live in LA but I like NY") new_sentence
library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$contain_keys(keys = "NY") processor$get_words(keys = "LA") processor$find_keys(sentence = "I live in LA but I like NY") processor$replace_keys(sentence = "I live in LA but I like NY") ## ------------------------------------------------ ## Method `keyword_processor$new` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new(ignore_case = FALSE, word_chars = letters) processor ## ------------------------------------------------ ## Method `keyword_processor$show_attrs` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$show_attrs(attrs = "dict_size") processor$show_attrs(attrs = "dict") ## ------------------------------------------------ ## Method `keyword_processor$add_keys_words` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) correct <- processor$add_keys_words(keys = c("NY", "CA"), words = c("New York City", "California")) # To check if there are duplicate keys correct ## ------------------------------------------------ ## Method `keyword_processor$contain_keys` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$contain_keys(keys = c("NY", "LA", "TX")) ## ------------------------------------------------ ## Method `keyword_processor$get_words` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", 
"Los Angeles")) processor$get_words(keys = c("NY", "LA", "TX")) ## ------------------------------------------------ ## Method `keyword_processor$find_keys` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) words_found <- processor$find_keys(sentence = "I live in LA but I like NY") do.call(rbind, words_found) ## ------------------------------------------------ ## Method `keyword_processor$replace_keys` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) new_sentence <- processor$replace_keys(sentence = "I live in LA but I like NY") new_sentence
library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$contain_keys(keys = "NY") processor$get_words(keys = "LA") processor$find_keys(sentence = "I live in LA but I like NY") processor$replace_keys(sentence = "I live in LA but I like NY") ## ------------------------------------------------ ## Method `keyword_processor$new` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new(ignore_case = FALSE, word_chars = letters) processor ## ------------------------------------------------ ## Method `keyword_processor$show_attrs` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$show_attrs(attrs = "dict_size") processor$show_attrs(attrs = "dict") ## ------------------------------------------------ ## Method `keyword_processor$add_keys_words` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) correct <- processor$add_keys_words(keys = c("NY", "CA"), words = c("New York City", "California")) # To check if there are duplicate keys correct ## ------------------------------------------------ ## Method `keyword_processor$contain_keys` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$contain_keys(keys = c("NY", "LA", "TX")) ## ------------------------------------------------ ## Method `keyword_processor$get_words` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", 
"Los Angeles")) processor$get_words(keys = c("NY", "LA", "TX")) ## ------------------------------------------------ ## Method `keyword_processor$find_keys` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) words_found <- processor$find_keys(sentence = "I live in LA but I like NY") do.call(rbind, words_found) ## ------------------------------------------------ ## Method `keyword_processor$replace_keys` ## ------------------------------------------------ library(rflashtext) processor <- keyword_processor$new() processor$add_keys_words(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) new_sentence <- processor$replace_keys(sentence = "I live in LA but I like NY") new_sentence
Based on the Python library flashtext. To see more details about the algorithm, visit: FlashText
attrs
list. Stores the attributes of the KeywordProcessor
object.
new()
Initializes the KeywordProcessor
object.
KeywordProcessor$new( keys = NULL, words = NULL, trie = NULL, id = "_word_", chars = paste0(c(letters, LETTERS, 0:9, "_"), collapse = ""), ignore_case = FALSE )
keys
character vector. Strings to identify (find/replace) in the text. Must be provided if trie
is NULL
.
words
character vector. Strings to be returned (find) or used as replacements (replace) when finding the respective keys
. Should have the same length as keys
. If not provided, words = keys
.
trie
character. JSON built character by character and needed for the search. It can be provided instead of keys
and words
.
id
character. Used to name the end nodes of the trie
dictionary.
chars
character. Used to validate if a word continues. Default paste0(c(letters, LETTERS, 0:9, "_"), collapse = "")
equivalent to [a-zA-Z0-9_]
.
ignore_case
logical. If FALSE
the search is case sensitive. Default FALSE
.
library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$attrs
library(rflashtext) processor <- KeywordProcessor$new(chars = paste0(letters, collapse = ""), keys = c("NY", "LA")) processor$attrs
show_trie()
Shows the trie
dictionary used to find/replace keys
.
KeywordProcessor$show_trie()
character. JSON string of the trie
structure. It can be converted to a list using jsonlite::fromJSON
.
library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$show_trie()
add_keys_words()
Adds keys
and words
to the trie
dictionary.
KeywordProcessor$add_keys_words(keys, words = NULL)
keys
character vector. Strings to identify (find/replace) in the text.
words
character vector. Strings to be returned (find) or used as replacements (replace) when finding the respective keys
. Should have the same length as keys
. If not provided, words = keys
.
library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$add_keys_words(keys = "CA", words = "California") processor$show_trie()
contain_keys()
Checks if keys
are in the trie
dictionary.
KeywordProcessor$contain_keys(keys)
keys
character vector. Strings to check whether they are already in the search trie
dictionary.
logical vector. TRUE
if the keys
are in the search trie
dictionary.
library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$contain_keys(keys = c("NY", "LA", "TX"))
get_words()
Gets the words
for the keys
found in the trie
dictionary.
KeywordProcessor$get_words(keys)
keys
character vector. Strings to get back the respective words
.
character vector. Respective words
. If keys
are not found, returns NA_character_
.
library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$get_words(keys = c("NY", "LA", "TX"))
find_keys()
Finds keys
in the sentences using the search trie
dictionary.
KeywordProcessor$find_keys(sentences, span_info = TRUE)
sentences
character vector. Texts in which to find the keys
previously defined.
span_info
logical. TRUE
to retrieve the words
and the position of the matches. FALSE
to only retrieve the words
. Default TRUE
.
list with the words
corresponding to keys
found in the sentences
. Hint: Use data.table::rbindlist(...)
to transform the list to a data frame.
library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) words_found <- processor$find_keys(sentences = "I live in LA but I like NY") words_found
replace_keys()
Replaces keys
found in the sentences by the corresponding words
.
KeywordProcessor$replace_keys(sentences)
sentences
character vector. Text to replace the keys
found by the corresponding words
.
character vector. Text with the keys
replaced by the respective words
.
library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) new_sentences <- processor$replace_keys(sentences = "I live in LA but I like NY") new_sentences
library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$contain_keys(keys = "NY") processor$get_words(keys = "LA") processor$find_keys(sentences = "I live in LA but I like NY") processor$replace_keys(sentences = "I live in LA but I like NY") ## ------------------------------------------------ ## Method `KeywordProcessor$new` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$attrs library(rflashtext) processor <- KeywordProcessor$new(chars = paste0(letters, collapse = ""), keys = c("NY", "LA")) processor$attrs ## ------------------------------------------------ ## Method `KeywordProcessor$show_trie` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$show_trie() ## ------------------------------------------------ ## Method `KeywordProcessor$add_keys_words` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$add_keys_words(keys = "CA", words = "California") processor$show_trie() ## ------------------------------------------------ ## Method `KeywordProcessor$contain_keys` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$contain_keys(keys = c("NY", "LA", "TX")) ## ------------------------------------------------ ## Method `KeywordProcessor$get_words` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$get_words(keys = c("NY", "LA", "TX")) ## ------------------------------------------------ 
## Method `KeywordProcessor$find_keys` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) words_found <- processor$find_keys(sentences = "I live in LA but I like NY") words_found ## ------------------------------------------------ ## Method `KeywordProcessor$replace_keys` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) new_sentences <- processor$replace_keys(sentences = "I live in LA but I like NY") new_sentences
library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$contain_keys(keys = "NY") processor$get_words(keys = "LA") processor$find_keys(sentences = "I live in LA but I like NY") processor$replace_keys(sentences = "I live in LA but I like NY") ## ------------------------------------------------ ## Method `KeywordProcessor$new` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$attrs library(rflashtext) processor <- KeywordProcessor$new(chars = paste0(letters, collapse = ""), keys = c("NY", "LA")) processor$attrs ## ------------------------------------------------ ## Method `KeywordProcessor$show_trie` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$show_trie() ## ------------------------------------------------ ## Method `KeywordProcessor$add_keys_words` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$add_keys_words(keys = "CA", words = "California") processor$show_trie() ## ------------------------------------------------ ## Method `KeywordProcessor$contain_keys` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$contain_keys(keys = c("NY", "LA", "TX")) ## ------------------------------------------------ ## Method `KeywordProcessor$get_words` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) processor$get_words(keys = c("NY", "LA", "TX")) ## ------------------------------------------------ 
## Method `KeywordProcessor$find_keys` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) words_found <- processor$find_keys(sentences = "I live in LA but I like NY") words_found ## ------------------------------------------------ ## Method `KeywordProcessor$replace_keys` ## ------------------------------------------------ library(rflashtext) processor <- KeywordProcessor$new(keys = c("NY", "LA"), words = c("New York", "Los Angeles")) new_sentences <- processor$replace_keys(sentences = "I live in LA but I like NY") new_sentences