# plotly standard imports
import plotly.graph_objs as go
import chart_studio.plotly as py
# Cufflinks wrapper on plotly
import cufflinks
# Data science imports
import pandas as pd
import numpy as np
# Options for pandas: show up to 30 columns when a DataFrame is displayed
pd.options.display.max_columns = 30
# Display all cell outputs: echo every top-level expression in a cell,
# not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot, init_notebook_mode
# Switch cufflinks and plotly to offline (in-notebook) rendering.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead
# of embedding it in the notebook - confirm against the plotly docs.
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
# Set global theme
cufflinks.set_config_file(world_readable=True, theme='pearl')
import tensorflow_datasets as tfds
from src.datasets import download
# Obtain the two dataset splits from the project's `download` helper; the
# result is unpacked as (train, test).
# NOTE(review): display_train_progress=True presumably enables progress
# output while the training split is prepared - confirm in src.datasets.
train_data, test_data = download(display_train_progress=True)
# Labels can be 0, 0.5, or 1, ranging from bad to good sentiment.
# We map them to the corresponding words to make exploration easier.
# Readable names for the numeric sentiment labels, ordered so that
# label * 2 is the index: 0 -> 'bad', 0.5 -> 'neutral', 1 -> 'good'.
label_categories = ['bad', 'neutral', 'good']


def humanize_label(x):
    """Return the readable sentiment name for a numeric label.

    Args:
        x: Numeric label; expected values are 0, 0.5 and 1.

    Returns:
        The matching entry of ``label_categories``.

    Raises:
        IndexError: If ``x`` is negative or maps past the last category.
    """
    index = int(x * 2)
    # Negative labels must not be accepted: without this check -0.5 would
    # wrap around through negative indexing and be reported as 'good'.
    if x < 0 or index >= len(label_categories):
        raise IndexError(
            f'label {x!r} is outside the expected range [0, 1]'
        )
    return label_categories[index]
# Load the training split into a two-column frame ('text', 'type').
# NOTE(review): assumes tfds.as_numpy yields (text, label) pairs - confirm.
train_df = pd.DataFrame(
    data=tfds.as_numpy(train_data),
    columns=['text', 'type'],
)
# Swap the numeric labels for their readable names
train_df['type'] = train_df['type'].map(humanize_label)
# Preview the first rows
train_df.head()
print('Training dataset records', len(train_df))
# Histogram of how many training records fall into each sentiment category
train_df['type'].iplot(
    title='Training data distribution',
    kind='hist',
    xTitle='Type',
    yTitle='count',
)