# plotly standard imports
import plotly.graph_objs as go
import chart_studio.plotly as py

# Cufflinks wrapper on plotly
import cufflinks

# Data science imports
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 30

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from plotly.offline import iplot, init_notebook_mode
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

# Set global theme
cufflinks.set_config_file(world_readable=True, theme='pearl')

import tensorflow_datasets as tfds

Let's explore datasets¶

from src.datasets import download

train_data, test_data = download(display_train_progress=True)

Humanize labels¶

Labels can be 0, 0.5, or 1, ranging from bad to good sentiment.

We map them to readable words to make exploration easier.

label_categories = ['bad', 'neutral', 'good']


def humanize_label(x):
    """Translate a numeric sentiment label (0, 0.5 or 1) into its word."""
    # Doubling the label turns 0 / 0.5 / 1 into the list positions 0 / 1 / 2.
    position = int(x * 2)
    return label_categories[position]

Training data distribution¶

train_df = pd.DataFrame(tfds.as_numpy(train_data), columns=['text', 'type'])

train_df['type'] = train_df['type'].apply(humanize_label)

train_df.head()

N/A% (0 of 1600) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--

Start reading dataset from ./data/training.1600000.processed.noemoticon.csv

100% (1600 of 1600) |####################| Elapsed Time: 0:07:46 Time:  0:07:46

print('Training dataset records', len(train_df.index))

train_df['type'].iplot(
    kind='hist',
    yTitle='count',
    xTitle='Type',
    title='Training data distribution'
)

Training dataset records 1600000

Testing data distribution¶

test_df = pd.DataFrame(tfds.as_numpy(test_data), columns=['text', 'type'])

test_df['type'] = test_df['type'].apply(humanize_label)

test_df[30:40]

Start reading dataset from ./data/testdata.manual.2009.06.14.csv

print('Testing dataset records', len(test_df.index))

# Vectorised comparison: one boolean per row, True where the label is 'neutral'.
neutralSeries = test_df['type'] == 'neutral'
# Summing a boolean Series counts its True entries.
print('Count of neutral rows', int(neutralSeries.sum()))

test_df['type'].iplot(
    kind='hist',
    yTitle='count',
    xTitle='Type',
    title='Testing data distribution'
)

Testing dataset records 498
Count of neutral rows 139

Normalize text¶

Preprocess text¶

We need to remove special symbols,
replace usernames and links with readable words,
and split hashtags so that they keep their meaning.

from src.normalize.normalize_text import preprocess_text

preprocess_text('Best awards øøøfor http://something.io/slpha ~and_ @futurer #thebest?')

JamSpell model loaded successfully

'Best awards for link and username thebest?'

Replace misspellings¶

We need to replace misspellings to decrease the vocabulary size and improve the network's results.

from src.normalize.replace_misspells import replace_misspells

replace_misspells('Berst awwwards for link and username the best')

'Best awards for link and username the best'

Replace contractions¶

Contractions are words that we write with an apostrophe. Examples of contractions are words like “ain’t” or “aren’t”.

To standardize the text, it is better to replace them with their expanded forms.

from src.normalize.replace_contractions import replace_contractions

replace_contractions([
    "I'm a text with contraction, which can't be' easilly 'parsed' by NN",
    "This's unexpected for pycontractions, possible can be fixed by changenging word corpus"
])

["I am a text with contraction, which cannot be' easilly 'parsed' by NN",
 "This's unexpected for pycontractions, possible can be fixed by changenging word corpus"]

Lemmatize words¶

We replace words with their root form to decrease the vocabulary size.

from src.normalize.lemmatization import lematize

[lematize(word) for word in ['changing', 'connected', 'us', 'back']]

['change', 'connect', 'us', 'back']

Remove stopwords¶

Stopwords do not carry actual meaning but create noise during processing.

from src.normalize.remove_stopwords import is_stopword

[word + ' - ' + str(is_stopword(word)) for word in ['is', 'word', 'be', 'a', 'super', 'still', 'up', 'this','too', 'much', 'nothing', 'where', 'everyone', 'very', 'down', 'last', 'ok', 'good', 'it', 'back', 'empty', 'anyone', 'so', 'why', 'my', 'already', 'us']]

['is - True',
 'word - False',
 'be - True',
 'a - True',
 'super - False',
 'still - False',
 'up - False',
 'this - False',
 'too - False',
 'much - False',
 'nothing - False',
 'where - False',
 'everyone - False',
 'very - False',
 'down - False',
 'last - False',
 'ok - False',
 'good - False',
 'it - False',
 'back - True',
 'empty - True',
 'anyone - True',
 'so - False',
 'why - False',
 'my - False',
 'already - False',
 'us - True']

Replace numbers¶

We replace every digit with #. This removes all concrete numbers from the text while keeping the information that a number was there.

from src.normalize.clean_text import replace_numbers

replace_numbers('I have $1 billion, but they only in my imagination. 1 billiion > 500 thouthands')

'I have $ # billion, but they only in my imagination. # billiion > ### thouthands'

Remove consecutive duplicates¶

This handles cases where the author repeats words or punctuation for emphasis.

from src.normalize.clean_text import remove_continiues_dublications

remove_continiues_dublications("very very cool ! ! !".split())

['very', 'cool', '!']

Result normalization¶

from src.normalize.normalize_text import normalize_text

normalize_text("Barrichello to win the #f1 today???. I really want Kubica to place, he's a fantastic driver. Damn, why can't I watch it. In the US is back")

'Barrichello win f # today ? . i want Kubica place he fantastic driver . damn why can not i watch it . in US'

Explore prepared dataset¶

from src.prepare_datasets import load_preprocessed_datasets

train_prep_dataset, test_prep_dataset = load_preprocessed_datasets(display_train_progress=True)

JamSpell model loaded successfully

Check preprocessed training datasets distribution¶

train_prep_df = pd.DataFrame(tfds.as_numpy(train_prep_dataset), columns=['text', 'type'])

train_prep_df['type'] = train_prep_df['type'].apply(humanize_label)

train_prep_df.head()

N/A% (0 of 1600) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--

Start reading dataset from ./preprocessed/training.1600000.processed.noemoticon.csv

100% (1600 of 1600) |####################| Elapsed Time: 0:06:58 Time:  0:06:58

print('Training dataset records', len(train_prep_df.index))

train_prep_df['type'].iplot(
    kind='hist',
    yTitle='count',
    xTitle='Type',
    title='Preprocessed training data distribution'
)

Training dataset records 1599976

Check testing dataset¶

test_prep_df = pd.DataFrame(tfds.as_numpy(test_prep_dataset), columns=['text', 'type'])

test_prep_df['type'] = test_prep_df['type'].apply(humanize_label)

test_prep_df.head()

Start reading dataset from ./preprocessed/testdata.manual.2009.06.14.csv

# This cell reports on the testing split, so the count is labelled accordingly.
print('Testing dataset records', len(test_prep_df.index))

# Histogram of the humanized labels in the preprocessed testing data.
test_prep_df['type'].iplot(
    kind='hist',
    yTitle='count',
    xTitle='Type',
    title='Preprocessed testing data distribution'
)

Training dataset records 498

Explore vocabulary¶

from src.normalize import load_encoder

encoder, vocab_size = load_encoder()
sorted_vocab = sorted(encoder.tokens)

def show_vocab_tokens(vocab):
    """Print every token with its character code points and its length.

    Each token is printed as ``token | code,code,... | length`` followed by
    an empty line, which makes invisible or mis-encoded characters visible.

    Args:
        vocab: Iterable of string tokens.
    """
    for word in vocab:
        character_numbers = ','.join(str(ord(character)) for character in word)
        # The explicit '\n' argument plus print's own line ending leaves a
        # blank line between tokens.
        print(word, '|', character_numbers, '|', len(word), '\n')

# print(' | '.join(sorted_vocab[90:100]))
show_vocab_tokens(sorted_vocab[400:600])

&quot;something | 38,113,117,111,116,59,115,111,109,101,116,104,105,110,103 | 15 

&quot;sometimes&quot | 38,113,117,111,116,59,115,111,109,101,116,105,109,101,115,38,113,117,111,116 | 20 

&quot;stayed | 38,113,117,111,116,59,115,116,97,121,101,100 | 12 

&quot;stop | 38,113,117,111,116,59,115,116,111,112 | 10 

&quot;stray&quot | 38,113,117,111,116,59,115,116,114,97,121,38,113,117,111,116 | 16 

&quot;take | 38,113,117,111,116,59,116,97,107,101 | 10 

&quot;tan | 38,113,117,111,116,59,116,97,110 | 9 

&quot;tell | 38,113,117,111,116,59,116,101,108,108 | 10 

&quot;that | 38,113,117,111,116,59,116,104,97,116 | 10 

&quot;the | 38,113,117,111,116,59,116,104,101 | 9 

&quot;there | 38,113,117,111,116,59,116,104,101,114,101 | 11 

&quot;they | 38,113,117,111,116,59,116,104,101,121 | 10 

&quot;think | 38,113,117,111,116,59,116,104,105,110,107 | 11 

&quot;thinking | 38,113,117,111,116,59,116,104,105,110,107,105,110,103 | 14 

&quot;this | 38,113,117,111,116,59,116,104,105,115 | 10 

&quot;tht | 38,113,117,111,116,59,116,104,116 | 9 

&quot;today | 38,113,117,111,116,59,116,111,100,97,121 | 11 

&quot;tori!!&quot; | 38,113,117,111,116,59,116,111,114,105,33,33,38,113,117,111,116,59 | 18 

&quot;tweetrita's&quot | 38,113,117,111,116,59,116,119,101,101,116,114,105,116,97,39,115,38,113,117,111,116 | 22 

&quot;u | 38,113,117,111,116,59,117 | 7 

&quot;uved | 38,113,117,111,116,59,117,118,101,100 | 10 

&quot;v | 38,113,117,111,116,59,118 | 7 

&quot;valuable | 38,113,117,111,116,59,118,97,108,117,97,98,108,101 | 14 

&quot;watch | 38,113,117,111,116,59,119,97,116,99,104 | 11 

&quot;we | 38,113,117,111,116,59,119,101 | 8 

&quot;well | 38,113,117,111,116,59,119,101,108,108 | 10 

&quot;what | 38,113,117,111,116,59,119,104,97,116 | 10 

&quot;who | 38,113,117,111,116,59,119,104,111 | 9 

&quot;y | 38,113,117,111,116,59,121 | 7 

&quot;ya'll&quot | 38,113,117,111,116,59,121,97,39,108,108,38,113,117,111,116 | 16 

&quot;yay&quot | 38,113,117,111,116,59,121,97,121,38,113,117,111,116 | 14 

&quot;yeah | 38,113,117,111,116,59,121,101,97,104 | 10 

&quot;yep!&quot | 38,113,117,111,116,59,121,101,112,33,38,113,117,111,116 | 15 

&quot;yes | 38,113,117,111,116,59,121,101,115 | 9 

&quot;you | 38,113,117,111,116,59,121,111,117 | 9 

&quot;your | 38,113,117,111,116,59,121,111,117,114 | 10 

&quot;øøø | 38,113,117,111,116,59,248,159,248,159,248,159 | 12 

' | 39 | 1 

'&lt | 39,38,108,116 | 4 

'&quot | 39,38,113,117,111,116 | 6 

'' | 39,39 | 2 

''michael | 39,39,109,105,99,104,97,101,108 | 9 

''what | 39,39,119,104,97,116 | 6 

')&gt | 39,41,38,103,116 | 5 

'- | 39,45 | 2 

'-&lt;3 | 39,45,38,108,116,59,51 | 7 

'a | 39,97 | 2 

'a. | 39,97,46 | 3 

'after | 39,97,102,116,101,114 | 6 

'blah | 39,98,108,97,104 | 5 

'bossnapping | 39,98,111,115,115,110,97,112,112,105,110,103 | 12 

'bout | 39,98,111,117,116 | 5 

'boy | 39,98,111,121 | 4 

'can | 39,99,97,110 | 4 

'cept | 39,99,101,112,116 | 5 

'chip | 39,99,104,105,112 | 5 

'come | 39,99,111,109,101 | 5 

'cos | 39,99,111,115 | 4 

'course | 39,99,111,117,114,115,101 | 7 

'coz | 39,99,111,122 | 4 

'cuz | 39,99,117,122 | 4 

'enough | 39,101,110,111,117,103,104 | 7 

'fallin | 39,102,97,108,108,105,110 | 7 

'game | 39,103,97,109,101 | 5 

'have | 39,104,97,118,101 | 5 

'he | 39,104,101 | 3 

'hearts | 39,104,101,97,114,116,115 | 7 

'hes | 39,104,101,115 | 4 

'how | 39,104,111,119 | 4 

'i | 39,105 | 2 

'i'm | 39,105,39,109 | 4 

'im | 39,105,109 | 3 

'john | 39,106,111,104,110 | 5 

'k.now | 39,107,46,110,111,119 | 6 

'maybe | 39,109,97,121,98,101 | 6 

'my | 39,109,121 | 3 

'night | 39,110,105,103,104,116 | 6 

'nuff | 39,110,117,102,102 | 5 

'oh | 39,111,104 | 3 

'r | 39,114 | 2 

'relapse | 39,114,101,108,97,112,115,101 | 8 

'sides | 39,115,105,100,101,115 | 6 

'sigh | 39,115,105,103,104 | 5 

'sign',be | 39,115,105,103,110,39,44,98,101 | 9 

'sorry | 39,115,111,114,114,121 | 6 

'sup | 39,115,117,112 | 4 

't | 39,116 | 2 

'tain | 39,116,97,105,110 | 5 

'technological | 39,116,101,99,104,110,111,108,111,103,105,99,97,108 | 14 

'tell | 39,116,101,108,108 | 5 

'the | 39,116,104,101 | 4 

'till | 39,116,105,108,108 | 5 

'timely | 39,116,105,109,101,108,121 | 7 

'tis | 39,116,105,115 | 4 

'to | 39,116,111 | 3 

'twas | 39,116,119,97,115 | 5 

'twillst | 39,116,119,105,108,108,115,116 | 8 

'u | 39,117 | 2 

'wannabe | 39,119,97,110,110,97,98,101 | 8 

'whose | 39,119,104,111,115,101 | 6 

'you | 39,121,111,117 | 4 

'âª | 39,226,153,170 | 4 

( | 40 | 1 

(&amp | 40,38,97,109,112 | 5 

(((hugs | 40,40,40,104,117,103,115 | 7 

((hope | 40,40,104,111,112,101 | 6 

((hugs | 40,40,104,117,103,115 | 6 

((me | 40,40,109,101 | 4 

((miss | 40,40,109,105,115,115 | 6 

(*thinks*) | 40,42,116,104,105,110,107,115,42,41 | 10 

(-: | 40,45,58 | 3 

(-; | 40,45,59 | 3 

(-_-) | 40,45,95,45,41 | 5 

(1 | 40,49 | 2 

(2 | 40,50 | 2 

(7 | 40,55 | 2 

(8 | 40,56 | 2 

(8).so | 40,56,41,46,115,111 | 6 

(: | 40,58 | 2 

(:p | 40,58,112 | 3 

(; | 40,59 | 2 

(= | 40,61 | 2 

(@supachiinga | 40,64,115,117,112,97,99,104,105,105,110,103,97 | 13 

(a | 40,97 | 2 

(again | 40,97,103,97,105,110 | 6 

(agree | 40,97,103,114,101,101 | 6 

(albeit | 40,97,108,98,101,105,116 | 7 

(also | 40,97,108,115,111 | 5 

(and | 40,97,110,100 | 4 

(annoyed)ang | 40,97,110,110,111,121,101,100,41,97,110,103 | 12 

(be | 40,98,101 | 3 

(better | 40,98,101,116,116,101,114 | 7 

(bonus | 40,98,111,110,117,115 | 6 

(bowing | 40,98,111,119,105,110,103 | 7 

(btw | 40,98,116,119 | 4 

(ca | 40,99,97 | 3 

(cant | 40,99,97,110,116 | 5 

(captain | 40,99,97,112,116,97,105,110 | 8 

(con't | 40,99,111,110,39,116 | 6 

(cont | 40,99,111,110,116 | 5 

(cost | 40,99,111,115,116 | 5 

(crickets) | 40,99,114,105,99,107,101,116,115,41 | 10 

(damn | 40,100,97,109,110 | 5 

(darn | 40,100,97,114,110 | 5 

(dewaynethompson | 40,100,101,119,97,121,110,101,116,104,111,109,112,115,111,110 | 16 

(do | 40,100,111 | 3 

(dreamssss | 40,100,114,101,97,109,115,115,115,115 | 10 

(envious | 40,101,110,118,105,111,117,115 | 8 

(esp | 40,101,115,112 | 4 

(even | 40,101,118,101,110 | 5 

(farts | 40,102,97,114,116,115 | 6 

(finish).hvnt | 40,102,105,110,105,115,104,41,46,104,118,110,116 | 13 

(for | 40,102,111,114 | 4 

(free | 40,102,114,101,101 | 5 

(good | 40,103,111,111,100 | 5 

(grumbling | 40,103,114,117,109,98,108,105,110,103 | 10 

(guess | 40,103,117,101,115,115 | 6 

(happy | 40,104,97,112,112,121 | 6 

(haven't | 40,104,97,118,101,110,39,116 | 8 

(he | 40,104,101 | 3 

(heh | 40,104,101,104 | 4 

(hehe | 40,104,101,104,101 | 5 

(him | 40,104,105,109 | 4 

(hopefully | 40,104,111,112,101,102,117,108,108,121 | 10 

(how | 40,104,111,119 | 4 

(hugs | 40,104,117,103,115 | 5 

(hungry | 40,104,117,110,103,114,121 | 7 

(hurts | 40,104,117,114,116,115 | 6 

(i | 40,105 | 2 

(i'm | 40,105,39,109 | 4 

(if | 40,105,102 | 3 

(ii | 40,105,105 | 3 

(im | 40,105,109 | 3 

(imu | 40,105,109,117 | 4 

(inhale | 40,105,110,104,97,108,101 | 7 

(internet | 40,105,110,116,101,114,110,101,116 | 9 

(jk | 40,106,107 | 3 

(just | 40,106,117,115,116 | 5 

(kissandmakeup | 40,107,105,115,115,97,110,100,109,97,107,101,117,112 | 14 

(l | 40,108 | 2 

(lol | 40,108,111,108 | 4 

(makes | 40,109,97,107,101,115 | 6 

(maybe | 40,109,97,121,98,101 | 6 

(moi= | 40,109,111,105,61 | 5 

(my | 40,109,121 | 3 

(n | 40,110 | 2 

(nights | 40,110,105,103,104,116,115 | 7 

(no | 40,110,111 | 3 

(not | 40,110,111,116 | 4 

(oh | 40,111,104 | 3 

(ok | 40,111,107 | 3 

(one | 40,111,110,101 | 4 

(or | 40,111,114 | 3 

(p.s | 40,112,46,115 | 4 

(pasingot | 40,112,97,115,105,110,103,111,116 | 9 

(pause | 40,112,97,117,115,101 | 6 

(please | 40,112,108,101,97,115,101 | 7 

(praying | 40,112,114,97,121,105,110,103 | 8 

(ps | 40,112,115 | 3 

(ref | 40,114,101,102 | 4

Explore training metrics¶

from validators.url import url as validate_url

validate_url('glamourkills.com')

ValidationFailure(func=url, args={'value': 'glamourkills.com', 'public': False})

# Load the training metrics; the plotting cells that follow read the columns
# epoch, accuracy, val_accuracy, loss and val_loss from this frame.
# The path is relative, so ./metrics/training.csv must exist under the
# current working directory (the recorded run failed with FileNotFoundError).
df = pd.read_csv('./metrics/training.csv')
df.head()

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-5-30480f2a5022> in <module>
----> 1 df = pd.read_csv('./metrics/training.csv')
      2 df.head()

/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    684     )
    685 
--> 686     return _read(filepath_or_buffer, kwds)
    687 
    688 

/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    450 
    451     # Create the parser.
--> 452     parser = TextFileReader(fp_or_buf, **kwds)
    453 
    454     if chunksize or iterator:

/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    934             self.options["has_index_names"] = kwds["has_index_names"]
    935 
--> 936         self._make_engine(self.engine)
    937 
    938     def close(self):

/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1166     def _make_engine(self, engine="c"):
   1167         if engine == "c":
-> 1168             self._engine = CParserWrapper(self.f, **self.options)
   1169         else:
   1170             if engine == "python":

/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1996         kwds["usecols"] = self.usecols
   1997 
-> 1998         self._reader = parsers.TextReader(src, **kwds)
   1999         self.unnamed_cols = self._reader.unnamed_cols
   2000 

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

FileNotFoundError: [Errno 2] No such file or directory: './metrics/training.csv'

# Plot the accuracy and val_accuracy columns against epoch.
df[['epoch', 'accuracy', 'val_accuracy']].iplot(
    x='epoch',
    mode='lines+markers',
    xTitle='epoch',
    yTitle='accuracy', 
    title='Training accuracy',
    linecolor='black',
)

# Plot the loss and val_loss columns against epoch.
df[['epoch', 'loss', 'val_loss']].iplot(
    x='epoch',
    mode='lines+markers',
    xTitle='epoch',
    yTitle='loss',  # this chart shows losses, not accuracy
    title='Losses'
)

Predictions¶

Load probability model¶

It gives the predicted probability of each model class.

0 - bad review, 1 - good review

from src.predict import get_probability_model

model = get_probability_model()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
bidirectional (Bidirectional (None, None, 128)         186880    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
=================================================================
Total params: 232,386
Trainable params: 232,386
Non-trainable params: 0
_________________________________________________________________

First, we try predicting on some data from the training dataset.

from src.predict import get_text_and_label_from_dataset, predict
REVIEW_INDEX = 110

# The probability model has two output classes (0 - bad, 1 - good), so a class
# index must not be looked up in the three-entry label_categories list, where
# position 1 is 'neutral'. Use only its first and last entries instead.
binary_categories = (label_categories[0], label_categories[-1])

text, real_label = get_text_and_label_from_dataset(REVIEW_INDEX)

print('text for prediction\n\n', text, '\n')

# predict returns the chosen class index together with the class probabilities.
predicted_label, predictions = predict(text, model)

print(binary_categories[predicted_label], 'review')

print('\n\nPredicted label:', predicted_label, 'real label: ', real_label, 'predictions:', predictions)
if predicted_label == real_label:
    print('Successfully predicted')
else:
    print('Failed to predict')

text for prediction

 Dakota (1988) was another early Lou Diamond Phillips starring vehicle. This film is similar to the later released film Harley. There are a few differences but they're both the same. I don't know which one came first. I guess it'll remain one of the mysteries of life. But they both are troubled "kids" who are trying to turn there lives around. Instead of bikes this one involves horses. They're basically the same movie and they're both cheesy as hell. If you're a serious L.D.P. fan then I recommend that you watch them both. You get some extreme mugging and posturing from L.D.P. if you're game then go for it.<br /><br />Not recommended, except for L.D.P. fans!!! 

bad review


Predicted label: 0 real label:  0 predictions: [0.70361704 0.29638302]
Successfully predicted

Then we try predicting on handwritten text.

# Can change text and check model
hadwriten = 'This is good film'

print('Hendwriten text:\n', hadwriten, '\n')

handwriten_label, predictions = predict(hadwriten, model)

# The probability model has two output classes (0 - bad, 1 - good), so map the
# class index onto the first and last entries of label_categories and skip
# 'neutral'.
binary_categories = (label_categories[0], label_categories[-1])

# Report the label predicted for this text; predicted_label still holds the
# result of the dataset-sample prediction made earlier.
print(binary_categories[handwriten_label], 'review')

print('Probabilities', predictions)

Hendwriten text:
 This is good film 

bad review
Probabilities [0.3463477 0.6536523]

	text	type
0	b"@switchfoot http://twitpic.com/2y1zl - Awww,...	bad
1	b"is upset that he can't update his Facebook b...	bad
2	b'@Kenichan I dived many times for the ball. M...	bad
3	b'my whole body feels itchy and like its on fi...	bad
4	b"@nationwideclass no, it's not behaving at al...	bad

	text	type
30	b'need suggestions for a good IR filter for my...	neutral
31	b'@surfit: I just checked my google for my bus...	neutral
32	b"@phyreman9 Google is always a good place to ...	good
33	b'Played with an android google phone. The sli...	bad
34	b'US planning to resume the military tribunals...	bad
35	b'omg so bored & my tattoooos are so itchy...	bad
36	b"I'm itchy and miserable!"	bad
37	b"@sekseemess no. I'm not itchy for now. Maybe...	bad
38	b'RT @jessverr I love the nerdy Stanford human...	good
39	b'@spinuzzi: Has been a bit crazy, with steep ...	good

	text	type
0	b'username link - www bummer . you should get ...	bad
1	b'upset he can not update his Facebook texting...	bad
2	b'username i dive time ball . manage save ## r...	bad
3	b'my body feel itchy and like fire'	bad
4	b'username no it not behave at all . i mad . w...	bad

	text	type
0	b'username i loooooooovvvvvveee my Kindle # . ...	good
1	b'read my kindle # ... love it ... Lee childs ...	good
2	b'okay \xc3\xaf\xc2\xbb\xc2\xbf # assessment k...	good
3	b'username you will love Kindle # . i month an...	good
4	b'username Fair enough . but i Kindle # and i ...	good