Import texthero ...
import texthero as hero
import pandas as pd
... load any text dataset with Pandas
# Read the BBC Sport CSV straight from its GitHub URL into a DataFrame,
# then preview the first two rows.
df = pd.read_csv(
    filepath_or_buffer="https://github.com/jbesomi/texthero/raw/master/dataset/bbcsport.csv",
)
df.head(n=2)
|   | text | topic |
|---|---|---|
| 0 | Claxton hunting first major medal\n\nBritish h... | athletics |
| 1 | O'Sullivan could run in Worlds\n\nSonia O'Sull... | athletics |
Preprocess it ...
# Overwrite the raw text column with the result of hero.clean.
df['text'] = df['text'].pipe(hero.clean)
|   | text | topic |
|---|---|---|
| 0 | claxton hunting first major medal british hurd... | athletics |
| 1 | sullivan could run worlds sonia sullivan indic... | athletics |
... represent it
# Store hero.tfidf's output for each text (max_features=100) in a new
# column, then preview it next to the topic label.
df['tfidf'] = df['text'].pipe(hero.tfidf, max_features=100)
df[["tfidf", "topic"]].head(n=2)
|   | tfidf | topic |
|---|---|---|
| 0 | [0.0, 0.13194458247285848, 0.0, 0.0, 0.0, 0.0,... | athletics |
| 1 | [0.0, 0.13056235989725676, 0.0, 0.205187581391... | athletics |
Reduce dimensionality and visualize the vector space
# Run hero.pca on the TF-IDF column, then scatter-plot the result with
# points coloured by topic.
df['pca'] = df['tfidf'].pipe(hero.pca)
hero.scatterplot(df, col='pca', color='topic', title="PCA BBC Sport news")
... need more? Find named entities
# Attach hero.named_entities' output for every text, then preview two
# rows alongside the topic label.
df['named_entities'] = df['text'].pipe(hero.named_entities)
df[['named_entities', 'topic']].head(n=2)
|   | named_entities | topic |
|---|---|---|
| 0 | [(claxton, ORG, 0, 7), (first, ORDINAL, 16, 21... | athletics |
| 1 | [(sullivan, ORG, 0, 8), (sonia sullivan, PERSO... | athletics |
Show top words ...
# Keep only the first NUM_TOP_WORDS entries of hero.top_words' result.
NUM_TOP_WORDS = 5
df['text'].pipe(hero.top_words)[:NUM_TOP_WORDS]
|   | text |
|---|---|
| said | 1338 |
| first | 790 |
| england | 749 |
| game | 681 |
| one | 671 |
And much more!