Social News in 1000 Steps – Step 3

This entry is part 3 of 14 in the series Social News

In the previous posts I introduced a “query by example”-like approach. Both the “query” part and the “example” part need a lot of work. This time I give (modified) code to store documents in a corpus using a multiprocessing pool.

The following code contains the changes for step 3:

import feedparser as fp
import newspaper
import pandas as pd
import dautil as dl
import datetime
from multiprocessing_on_dill import Pool
import listparser
import os
import numpy as np
def select(arr):
    """Return the 99.5th percentile of ``arr``.

    Passed as ``select_func`` to ``dl.nlp.select_terms`` by ``get_terms``.
    """
    cutoff = np.percentile(arr, 99.5)
    return cutoff
def get_terms(alist):
    """Select terms from ``alist`` via dautil's TF-IDF helpers.

    ``alist`` is handed to ``dl.nlp.calc_tfidf``; the result is then passed
    to ``dl.nlp.select_terms`` with ``method=None`` and the module's
    ``select`` function as ``select_func``.

    Returns whatever ``dl.nlp.select_terms`` returns. The script section
    calls ``.intersection`` on it, so it is presumably a set -- verify
    against dautil.
    """
    tfidf = dl.nlp.calc_tfidf(alist)
    chosen = dl.nlp.select_terms(tfidf, method=None, select_func=select)
    return chosen
def init_config():
    """Create the ``newspaper.Config`` used for article downloads.

    The returned config has ``fetch_images`` set to ``False`` and
    ``verbose`` set to ``True``.
    """
    cfg = newspaper.Config()
    cfg.verbose = True
    cfg.fetch_images = False
    return cfg
def process_url(entry):
    """Fetch the article behind one feed entry and store it in the corpus.

    NOTE(review): the published listing lost several tokens: the ``try:``
    line, every URL expression (reconstructed here as ``entry.link``), and
    most likely the early ``return`` plus the download/parse calls. This
    body is a reconstruction -- confirm against the author's original.

    Relies on the module-level ``corpus``, ``config`` and ``log`` objects
    created in the ``__main__`` section.

    Args:
        entry: A parsed feed entry; its ``link`` and ``title`` attributes
            are read.

    Returns:
        None. Article failures (``ArticleException``) are logged as
        warnings rather than raised.
    """
    link = entry.link

    try:
        # presumably url_set holds the URLs already stored in the corpus,
        # so known articles are skipped -- verify against dautil.
        if link in corpus.url_set:
            return

        # Pass the config by keyword: the second positional parameter of
        # newspaper.Article is the title, not the config.
        a = newspaper.Article(link, config=config)
        # The text is only available after downloading and parsing.
        a.download()
        a.parse()

        # NOTE(review): the third argument was blank in the listing; the
        # link is the assumed value -- check WebCorpus.store_text.
        corpus.store_text(entry.title.replace(' ', '_'),
                          a.text, link, entry.title,
                          " ".join([author.lower()
                                    for author in a.authors]))
        log.debug('Link={0} len(text)={1}'.format(
            link, len(a.text)))
    except newspaper.article.ArticleException as e:
        log.warning('{0} {1}'.format(link, e))
if __name__ == "__main__":
    # Script flow: collect feed URLs, download the new articles into the
    # corpus in parallel, then append newly found keywords to keywords.csv.
    #
    # NOTE(review): the published listing lost tokens in this section (the
    # hard-coded seed URLs, the body of the entry loop, the Pool call and
    # the timestamp expression). The marked lines below are reconstructions.

    # NOTE(review): the original seed feed URLs were stripped from the
    # listing; re-add them here if needed.
    urls = []

    # Add every feed URL found in the OPML files of the 'opml' directory.
    for f in os.listdir('opml'):
        if f.endswith('opml'):
            fname = os.path.join('opml', f)
            parsed_opml = listparser.parse(fname)
            urls.extend([feed.url for feed in parsed_opml.feeds])

    # These module-level names are read by process_url.
    log = dl.log_api.conf_logger(__name__)
    config = init_config()
    corpus = dl.nlp.WebCorpus('sonar_corpus')

    entries = []
    seen_links = set()

    for url in set(urls):
        rss = fp.parse(url)

        # Feed entries are dict subclasses and therefore unhashable, so
        # set(rss.entries) would raise TypeError. De-duplicate on the link.
        for entry in rss.entries:
            link = entry.get('link')

            # process_url needs a link; skip entries without one.
            if not link or link in seen_links:
                continue

            seen_links.add(link)
            entries.append(entry)

    # Reconstructed: map process_url over the entries with 8 workers.
    with Pool(8) as p:
        p.map(process_url, entries)

    texts = corpus.get_texts()
    text_terms = get_terms(texts)
    title_terms = get_terms(corpus.get_titles())
    # Keep terms selected for both texts and titles, minus author names.
    terms = text_terms.intersection(title_terms) - corpus.get_authors()

    # keywords.csv must already exist and have a 'Term' column.
    fname = 'keywords.csv'
    old = set(pd.read_csv(fname)['Term'].values.tolist())

    with open(fname, 'a') as csv_file:
        for t in terms:
            if t not in old:
                # Reconstructed: an ISO timestamp is assumed, since the
                # datetime import is otherwise unused -- confirm.
                ts = datetime.datetime.now().isoformat()
                csv_file.write(ts + ',' + t + ',Use\n')
Series Navigation: « Social News in 1000 Steps – Step 2 | Social News in 1000 Steps – Step 4 »
By the author of NumPy Beginner's Guide, NumPy Cookbook and Instant Pygame. If you enjoyed this post, please consider leaving a comment or subscribing to the RSS feed to have future articles delivered to your feed reader.
This entry was posted in Uncategorized. Bookmark the permalink.