In [1]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
Collecting torch_nightly
[?25l  Downloading https://download.pytorch.org/whl/nightly/cu92/torch_nightly-1.2.0.dev20190805%2Bcu92-cp36-cp36m-linux_x86_64.whl (704.8MB)
[K     |████████████████████████████████| 704.8MB 25kB/s 
[?25hInstalling collected packages: torch-nightly
Successfully installed torch-nightly-1.2.0.dev20190805+cu92


In [0]:
# import libraries
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

In [3]:
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
df = pd.DataFrame({'label':dataset.target, 'text':dataset.data})
df.shape
df

Unnamed: 0,label,text
0,17,Well i'm not sure about the story nad it did s...
1,0,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,17,Although I realize that principle is not one o...
3,11,Notwithstanding all the legitimate fuss about ...
4,10,"Well, I will have to change the scoring on my ..."
...,...,...
11309,17,"Danny Rubenstein, an Israeli journalist, will ..."
11310,13,\n
11311,9,\nI agree. Home runs off Clemens are always m...
11312,4,I used HP DeskJet with Orange Micros Grappler ...


In [5]:
#  ‘comp.graphics’ and ‘rec.sport.hockey’, 
df = df[df['label'].isin([1,10])]
df = df.reset_index(drop = True)
df['label'].value_counts()

10    600
1     584
Name: label, dtype: int64

In [6]:
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ")

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords 
stop_words = stopwords.words('english')

# tokenization 
tokenized_doc = df['text'].apply(lambda x: x.split())

# remove stop-words 
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization 
detokenized_doc = [] 
for i in range(len(df)): 
    t = ' '.join(tokenized_doc[i]) 
    detokenized_doc.append(t) 

df['text'] = detokenized_doc

from sklearn.model_selection import train_test_split

# split data into training and validation set
df_trn, df_val = train_test_split(df, stratify = df['label'], test_size = 0.4, random_state = 12)
df_trn.shape, df_val.shape

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


((710, 2), (474, 2))

In [7]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = df_val, path = "")

# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

In [8]:
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)
learn.fit_one_cycle(1, 1e-2)

Downloading https://s3.amazonaws.com/fast-ai-modelzoo/wt103-fwd


epoch,train_loss,valid_loss,accuracy,time
0,6.024404,5.180079,0.247688,00:02


In [9]:
learn.unfreeze()
learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,5.096985,4.798,0.279241,00:03


In [10]:
learn.predict("This is a review about Data Science", n_words=15)

'This is a review about Data Science Brown Calculating mathematically BLUE wales an strange quality x Black'

'This is a review about a British system It appears bergman systems people know younger generation could even around something around admission It didn like time since people do techno making those get query would What one type i interested recommend trying compatibility review The way claim postings users decide whether single NHL format lha waste memory European etc It relative concern single different NHL Assembly FP certain sections necessary Believe virtual worlds not house category This fourth idea is unlikely something public obtaining even very important Gretzky people qualified one'

In [0]:
learn.save_encoder('ft_enc')

In [12]:
learn = text_classifier_learner(data_clas,  AWD_LSTM, drop_mult=0.5)
learn.load_encoder('ft_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (710 items)
x: TextList
xxbos xxmaj it looks like xxmaj edmonton xxmaj oilers decided take xxmaj european xxunk spring xxmaj ranford xxmaj tugnutt xxmaj benning xxmaj manson xxmaj smith xxmaj buchberger xxmaj corson playing xxmaj canada xxmaj podein xxmaj weight playing xxup us xxmaj is xxmaj kravchuk playing xxmaj xxunk i know nagging injuries late season xxmaj podein interesting case eligible play xxmaj cape xxmaj breton xxup ahl playoffs like xxmaj kovalev xxmaj zubov xxmaj andersson obviously xxmaj sather xxmaj pocklington total xxunk everyone makes certainly case massively xxunk xxmaj paramount xxmaj new xxmaj york xxmaj rangers,xxbos xxmaj this xxunk xxmaj speaking die hard i i read xxunk hard xxunk xxmaj toronto xxmaj cup finals xxmaj first anyone planet heard team xxmaj detroit xxmaj al xxmaj xxunk however spell idiot name must xxmaj chicago xxup espn said even close xxmaj chicago xxunk win xxmaj norris xxmaj division xxmaj p

In [13]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.270112,0.158374,0.957806,00:07


In [14]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/2., 5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.234733,0.136975,0.951477,00:08


In [15]:
learn.unfreeze()
learn.fit_one_cycle(1, slice(2e-3/100, 2e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.130992,0.104516,0.957806,00:10


In [16]:
# get predictions
preds, targets = learn.get_preds()

predictions = np.argmax(preds, axis = 1)
pd.crosstab(predictions, targets)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,219,5
1,15,235


In [0]:
from sklearn.metrics import f1_score

In [18]:
f1_score(targets, predictions, average='macro')
# f1_score(targets, predictions, average='micro')
# f1_score(targets, predictions, average='weighted')

0.9577577755993227