Removing stopwords is a standard NLP task. Given a common list stopwords and N texts, how can I use multiprocessing in Python to remove the stopwords from the N texts in parallel? Normally one does something like
stopwords = ["...", ...]  # whatever list of stopwords here

def remove_stopwords(text):
    return [w for w in text.split() if w not in stopwords]
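
A side note on the serial version: w not in stopwords scans the whole list for every token, so converting the list to a set first makes each lookup O(1) on average. A minimal sketch (the names are my own):

stopword_set = set(stopwords)

def remove_stopwords_fast(text):
    # set membership is O(1) on average, vs a linear scan for a list
    return [w for w in text.split() if w not in stopword_set]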
If we were to do multiprocessing, stopwords would have to be copied into each worker process. That is expensive: the copying can take far more time than executing remove_stopwords itself. How can I give all the processes access to a single shared instance of the Python list stopwords, so that no time is spent copying it?
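
The closest pattern I know of is a Pool initializer, which copies the stopwords once per worker process instead of once per task; it is not a truly shared instance, but it bounds the copying. A sketch with made-up data (init_worker, worker_stopwords, and the sample lists are my own names):

from multiprocessing import Pool

worker_stopwords = None  # populated once per worker by the initializer

def init_worker(sw):
    global worker_stopwords
    worker_stopwords = set(sw)  # one copy per process, then O(1) lookups

def clean(text):
    return " ".join(w for w in text.split() if w not in worker_stopwords)

if __name__ == "__main__":
    stopwords = ["the", "a", "of"]  # placeholder stopword list
    texts = ["the cat sat on a mat", "a dog chased the cat"]
    with Pool(processes=4, initializer=init_worker, initargs=(stopwords,)) as p:
        print(p.map(clean, texts))

Is there a way to avoid even that one copy per process?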
This is what I tried. The shared-memory array version is extremely slow; in fact, I have never seen it finish:
import random
import time
from multiprocessing.sharedctypes import Array
from ctypes import Structure, c_wchar_p
from multiprocessing import Pool
class Word(Structure):
    # wraps one string; note that c_wchar_p stores a raw pointer,
    # which is only meaningful inside the process that created it
    _fields_ = [('text', c_wchar_p)]
chars = "abcdefghijklmnopqrstuvwxyzáéíóúñABCDEFGHIJKLMNOPQRSTUVWXYZ"
def generate_vocab(n_words, char_mean=8, char_std=3):
    # build n_words random words with normally distributed lengths
    words = []
    for x in range(n_words):
        k = max(1, round(random.gauss(char_mean, char_std)))
        word = "".join(random.choices(chars, k=k))
        words.append(word)
    return words
def get_texts(n_texts, n_words, vocab):
    # each text is n_words random vocabulary words joined by spaces
    return [" ".join(random.choices(vocab, k=n_words)) for _ in range(n_texts)]
def remove_stopword(doc, sw):
    return " ".join([token for token in doc.split() if token not in sw])
vocab = generate_vocab(500)
trial_texts = get_texts(5000, 500, vocab)
stop_words = vocab[:30]
# Array defaults to lock=True, so every element access acquires the lock
common_stopwords = Array(Word, [(word,) for word in stop_words])
st = time.time()
clean_texts = [remove_stopword(doc, sw=stop_words) for doc in trial_texts]
print(time.time()-st, "to clean them in series")
def rm_stopword_usual(doc):
    return remove_stopword(doc, stop_words)

def rm_stopword_shared(doc):
    # each element of common_stopwords is a Word struct, so the
    # "token not in sw" test compares str against Word and never matches
    return remove_stopword(doc, common_stopwords)
st = time.time()
with Pool(processes=8) as p:
    clean_texts = p.map(rm_stopword_usual, trial_texts)
print(time.time() - st, "to clean them in parallel copying stopwords")
st = time.time()
with Pool(processes=8) as p:
    clean_texts = p.map(rm_stopword_shared, trial_texts)
print(time.time() - st, "to clean them in parallel with shared stopwords")
The partial output that I get is
1.2284278869628906 to clean them in series
0.3515629768371582 to clean them in parallel copying stopwords
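
To isolate where the shared version loses its time, I think a micro-benchmark of the membership test alone would help. Since Array defaults to lock=True, every indexed access acquires the lock, and the in operator walks the array one locked access at a time. A standalone sketch with stand-in words (the names are my own; I have not timed it):

import time
from ctypes import Structure, c_wchar_p
from multiprocessing.sharedctypes import Array

class Word(Structure):
    _fields_ = [("text", c_wchar_p)]

stop_words = [f"word{i}" for i in range(30)]  # stand-in stopwords
shared = Array(Word, [(w,) for w in stop_words])
token = "missing"  # present in neither container

st = time.time()
for i in range(100_000):
    hit = token in stop_words  # plain list: fast C-level scan
print(time.time() - st, "for 100k lookups in the plain list")

st = time.time()
for i in range(100_000):
    hit = token in shared  # shared Array: one lock acquisition per element
print(time.time() - st, "for 100k lookups in the shared Array")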