Notes on Creating Data Science Apps With Streamlit
- Overview
- Streamlit
- Simple Stock Price
- Simple Bioinformatics DNA Count
- EDA Basketball
- EDA Cryptocurrency
- Classification Iris Data
- Regression Boston Housing Data
- Deploy App to Heroku
- Deploy App to Streamlit Sharing
Overview
Here are some notes I took while watching Chanin Nantasenamat’s video on creating data science web apps with Streamlit.
Streamlit
Turns data scripts into shareable web apps
pip install streamlit
Test Installation:
streamlit hello
Run apps:
streamlit run main.py
Format text using Markdown
Simple Stock Price
Get market data from Yahoo! Finance API
Dependencies
replit: Simple_Stock_Price
import yfinance as yf
import streamlit as st
import pandas as pd
# Write text in Markdown format
st.write("""
# Simple Stock Price App
Shown are the stock closing price and volume of iPath Global Carbon ETN!
""")

# https://towardsdatascience.com/how-to-get-stock-data-using-python-c0de1df17e75
# define the ticker symbol
tickerSymbol = 'GRN'
# get data on this ticker
tickerData = yf.Ticker(tickerSymbol)
# get the historical prices for this ticker
# Open High Low Close Volume Dividends Stock Splits
tickerDf = tickerData.history(period='1d', start='2019-12-27', end='2021-12-27')

# Create streamlit line charts, one per column of interest
st.write("""
## Closing Price
""")
st.line_chart(tickerDf.Close)
st.write("""
## Trading Volume
""")
st.line_chart(tickerDf.Volume)
Simple Bioinformatics DNA Count
Count the number of nucleotides 'A', 'T', 'G', 'C'
in a DNA sequence entered in a text box
Dependencies
- Pandas
pip install pandas
- Streamlit
pip install streamlit
- Altair
pip install altair
- Pillow
pip install pillow
replit: Simple_Bioinformatics_DNA_Count
# Import dependencies
import pandas as pd
import streamlit as st
import altair as alt
from PIL import Image
# Page Title
# Add hero image
image = Image.open('dna-ge3ed05159_1920.jpg')
st.image(image, use_column_width=True)

st.write("""
# DNA Nucleotide Count Web App
This app counts the nucleotide composition of query DNA!
***
""")

# Input Text Box
#st.sidebar.header('Enter DNA sequence')
st.header('Enter DNA sequence')

# Default text for the input box: a name line followed by the sequence lines
sequence_input = ">DNA Query\nGAACACGTGGAGGCAAACAGGAAGGTGAAGAAGAACTTATCCTATCAGGACGGAAGGTCCTGTGCTCGGG\nATCTTCCAGACGTCGCGACTCTAAATTGCCCCCTCTGAGGTCAAGGAACACAAGATGGTTTTGGAAATGC\nTGAACCCGATACATTATAACATCACCAGCATCGTGCCTGAAGCCATGCCTGCTGCCACCATGCCAGTCCT"

sequence = st.text_area("Sequence input", sequence_input, height=250)
# Split input text by line
sequence = sequence.splitlines()
# Skip the sequence name (first line)
sequence = sequence[1:]
# Concatenate list to string
sequence = ''.join(sequence)

st.write("""
***
""")

# Print the input DNA sequence
st.header('INPUT (DNA Query)')
# Bare expression on its own line: displayed through Streamlit's "magic" feature
sequence

# DNA nucleotide count
st.header('OUTPUT (DNA Nucleotide Count)')
# 1. Print dictionary
st.subheader('1. Print dictionary')


def DNA_nucleotide_count(seq):
    """Count the occurrences of each nucleotide letter in *seq*.

    Args:
        seq: The DNA sequence as a single string.

    Returns:
        A dict with the keys 'A', 'T', 'G' and 'C' (in that order), each mapped
        to the number of times that letter occurs in ``seq``.
    """
    return {base: seq.count(base) for base in 'ATGC'}


X = DNA_nucleotide_count(sequence)

# Bare expression on its own line: displayed through Streamlit's "magic" feature
X
# 2. Print text
st.subheader('2. Print text')
st.write('There are ' + str(X['A']) + ' adenine (A)')
st.write('There are ' + str(X['T']) + ' thymine (T)')
st.write('There are ' + str(X['G']) + ' guanine (G)')
st.write('There are ' + str(X['C']) + ' cytosine (C)')

# 3. Display DataFrame
st.subheader('3. Display DataFrame')
# One row per nucleotide; the counts land in a column labelled 0
df = pd.DataFrame.from_dict(X, orient='index')
df = df.rename({0: 'count'}, axis='columns')
# Move the nucleotide letters out of the index into a regular column
df.reset_index(inplace=True)
df = df.rename(columns={'index': 'nucleotide'})
st.write(df)

# 4. Display Bar Chart using Altair
st.subheader('4. Display Bar chart')
p = alt.Chart(df).mark_bar().encode(
    x='nucleotide',
    y='count'
)
p = p.properties(
    # Controls width of bar
    width=alt.Step(80)
)
st.write(p)
EDA Basketball
Scrape NBA player stats from a website and perform exploratory data analysis.
Dependencies
- Pandas
pip install pandas
- Streamlit
pip install streamlit
- Matplotlib
pip install matplotlib
- Seaborn
pip install seaborn
- Numpy
pip install numpy
- lxml
pip install lxml
Data Source
Basketball Statistics and History
replit: EDA_Basketball
import streamlit as st
import pandas as pd
import base64
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
st.title('NBA Player Stats Explorer')

st.markdown("""
This app performs simple webscraping of NBA player stats data!
* **Python libraries:** base64, pandas, streamlit
* **Data source:** [Basketball-reference.com](https://www.basketball-reference.com/)
""")

st.sidebar.header('User Input Features')
# Seasons 1950-2019, newest first
selected_year = st.sidebar.selectbox('Year', list(reversed(range(1950,2020))))
# Web scraping of NBA player stats
@st.cache
def load_data(year):
    """Download and tidy the per-game NBA player stats table for *year*.

    Args:
        year: Season year, inserted into the basketball-reference.com URL.

    Returns:
        A DataFrame built from the first HTML table on the page, with the
        repeated header rows removed, missing values replaced by 0, the
        percentage columns cast to float and the 'Rk' column dropped.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html'
    html = pd.read_html(url, header=0)
    df = html[0]
    # Delete repeating headers (rows whose Age cell holds the text 'Age')
    raw = df.drop(df[df.Age == 'Age'].index)
    # Fill missing data with 0
    raw = raw.fillna(0)
    # Cast the percentage columns to float
    for column in ['FG%', '3P%', '2P%', 'eFG%', 'FT%']:
        raw[column] = raw[column].astype(float)
    # Remove redundant index column
    playerstats = raw.drop(['Rk'], axis=1)
    return playerstats
playerstats = load_data(selected_year)

# sidebar - Team selection (every team is selected by default)
sorted_unique_team = sorted(playerstats.Tm.unique())
selected_team = st.sidebar.multiselect('Team', sorted_unique_team, sorted_unique_team)

# Sidebar - Position selection (every position is selected by default)
unique_pos = ['C', 'PF', 'SF', 'PG', 'SG']
selected_pos = st.sidebar.multiselect('Position', unique_pos, unique_pos)

# Filtering data: keep rows matching both the chosen teams and the chosen positions
df_selected_team = playerstats[(playerstats.Tm.isin(selected_team)) & (playerstats.Pos.isin(selected_pos))]

st.header('Display Player Stats of Selected Team(s)')
st.write('Data Dimension: ' + str(df_selected_team.shape[0]) + ' rows and ' + str(df_selected_team.shape[1]) + ' columns.')
st.dataframe(df_selected_team)
# Download NBA player stats data
# https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
def filedownload(df: pd.DataFrame) -> str:
    """Return an HTML download link that embeds *df* as a base64-encoded CSV.

    Args:
        df: DataFrame to serialise; it is written without its index.

    Returns:
        An ``<a>`` tag whose ``href`` is a ``data:`` URI holding the CSV and
        whose ``download`` attribute is ``playerstats.csv``.
    """
    csv = df.to_csv(index=False)
    # strings <-> bytes conversion: b64encode works on bytes, the link needs text
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="playerstats.csv">Download CSV File</a>'
    return href
# The link is raw HTML, so Streamlit must be told to render it unescaped
st.markdown(filedownload(df_selected_team), unsafe_allow_html=True)

# Heatmap
if st.button('Intercorrelation Heatmap'):
    st.header('Intercorrelation Matrix Heatmap')
    # Round-trip through a CSV file; presumably so pandas re-infers the column
    # dtypes on read - TODO confirm
    df_selected_team.to_csv('output.csv', index=False)
    df = pd.read_csv('output.csv')

    # Correlate numeric columns only: DataFrame.corr() raises on text columns
    # in pandas >= 2.0, where older versions silently skipped them
    corr = df.select_dtypes(include='number').corr()
    # Mask the upper triangle (diagonal included) so each pair is drawn once
    mask = np.zeros(corr.shape, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    with sns.axes_style("white"):
        fig, ax = plt.subplots(figsize=(7, 5))
        sns.heatmap(corr, mask=mask, vmax=1, square=True, ax=ax)
    st.pyplot(fig)
EDA Cryptocurrency
Use the BeautifulSoup library to scrape data from CoinMarketCap and perform exploratory data analysis.
Dependencies
- Pandas
pip install pandas
- Streamlit
pip install streamlit
- Matplotlib
pip install matplotlib
- lxml
pip install lxml
- BeautifulSoup
pip install beautifulsoup4
replit: EDA_Cryptocurrency
# This app is for educational purpose only. Insights gained is not financial advice. Use at your own risk!
import streamlit as st
from PIL import Image
import pandas as pd
import base64
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import json
import time
# ---------------------------------#
# New feature (make sure to upgrade your streamlit library)
# pip install --upgrade streamlit
# ---------------------------------#
# Page layout
# Page expands to full width
st.set_page_config(layout="wide")
# ---------------------------------#
# Title
image = Image.open("pexels-worldspectrum-844124.jpg")

st.image(image, width=500)

st.title("Crypto Price App")
st.markdown(
    """
This app retrieves cryptocurrency prices for the top 100 cryptocurrency from the **CoinMarketCap**!
"""
)
# ---------------------------------#
# About
expander_bar = st.expander("About")
expander_bar.markdown(
    """
* **Python libraries:** base64, pandas, streamlit, numpy, matplotlib, seaborn, BeautifulSoup, requests, json, time
* **Data source:** [CoinMarketCap](http://coinmarketcap.com).
* **Credit:** Web scraper adapted from the Medium article *[Web Scraping Crypto Prices With Python](https://towardsdatascience.com/web-scraping-crypto-prices-with-python-41072ea5b5bf)* written by [Bryan Feng](https://medium.com/@bryanf).
"""
)

# ---------------------------------#
# Page layout (continued)
# Divide page to 3 columns (col1 = sidebar, col2 and col3 = page contents)
col1 = st.sidebar
col2, col3 = st.columns((2, 1))

# ---------------------------------#
# Sidebar + Main panel
col1.header("Input Options")

# Sidebar - Currency price unit
currency_price_unit = col1.selectbox("Select currency for price", ("USD", "BTC", "ETH"))
# Web scraping of CoinMarketCap data
@st.cache
def load_data():
    """Scrape the CoinMarketCap front page into a DataFrame of coin listings.

    The listings are read from the JSON embedded in the page's
    ``__NEXT_DATA__`` script tag. Quote fields are looked up for the currency
    held in the module-level ``currency_price_unit``.

    Returns:
        A DataFrame with one row per listing and the columns ``coin_name``,
        ``coin_symbol``, ``market_cap``, ``percent_change_1h``,
        ``percent_change_24h``, ``percent_change_7d``, ``price`` and
        ``volume_24h``, in that order.

    Raises:
        RuntimeError: If the page contains no ``__NEXT_DATA__`` script tag.
        ValueError: If an expected field name is missing from the listing keys.
    """
    cmc = requests.get("https://coinmarketcap.com")
    soup = BeautifulSoup(cmc.content, "html.parser")

    data = soup.find("script", id="__NEXT_DATA__", type="application/json")
    if data is None:
        # Without this check the failure surfaces as an opaque AttributeError
        raise RuntimeError(
            "No __NEXT_DATA__ script tag found on https://coinmarketcap.com"
        )
    coin_data = json.loads(data.contents[0])
    listings = coin_data["props"]["initialState"]["cryptocurrency"]["listingLatest"][
        "data"
    ]

    # The first entry names the fields ("keysArr"); each later entry is indexed
    # by the position of a field name in that list.
    attributes = listings[0]["keysArr"]
    rows = listings[1:]

    def quote_index(field):
        """Return the position of a quote field for the selected currency."""
        return attributes.index(f"quote.{currency_price_unit}.{field}")

    # Output column name -> position of its value inside each listing row
    column_positions = {
        "coin_name": attributes.index("slug"),
        "coin_symbol": attributes.index("symbol"),
        "market_cap": quote_index("marketCap"),
        "percent_change_1h": quote_index("percentChange1h"),
        "percent_change_24h": quote_index("percentChange24h"),
        "percent_change_7d": quote_index("percentChange7d"),
        "price": quote_index("price"),
        "volume_24h": quote_index("volume24h"),
    }

    df = pd.DataFrame(
        {
            name: [row[position] for row in rows]
            for name, position in column_positions.items()
        }
    )
    return df
df = load_data()

# Sidebar - Cryptocurrency selections (every coin is selected by default)
sorted_coin = sorted(df["coin_symbol"])
selected_coin = col1.multiselect("Cryptocurrency", sorted_coin, sorted_coin)

df_selected_coin = df[(df["coin_symbol"].isin(selected_coin))]  # Filtering data

# Sidebar - Number of coins to display
num_coin = col1.slider("Display Top N Coins", 1, 100, 100)
df_coins = df_selected_coin[:num_coin]

# Sidebar - Percent change timeframe
percent_timeframe = col1.selectbox("Percent change time frame", ["7d", "24h", "1h"])
percent_dict = {
    "7d": "percent_change_7d",
    "24h": "percent_change_24h",
    "1h": "percent_change_1h",
}
selected_percent_timeframe = percent_dict[percent_timeframe]

# Sidebar - Sorting values
sort_values = col1.selectbox("Sort values?", ["Yes", "No"])

col2.subheader("Price Data of Selected Cryptocurrency")
# The dimensions describe the full selection, not the top-N slice shown below
col2.write(
    "Data Dimension: "
    + str(df_selected_coin.shape[0])
    + " rows and "
    + str(df_selected_coin.shape[1])
    + " columns."
)

col2.dataframe(df_coins)
# Download CSV data
# https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806
def filedownload(df: pd.DataFrame) -> str:
    """Return an HTML download link that embeds *df* as a base64-encoded CSV.

    Args:
        df: DataFrame to serialise; it is written without its index.

    Returns:
        An ``<a>`` tag whose ``href`` is a ``data:`` URI holding the CSV and
        whose ``download`` attribute is ``crypto.csv``.
    """
    csv = df.to_csv(index=False)
    # strings <-> bytes conversions: b64encode works on bytes, the link needs text
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="crypto.csv">Download CSV File</a>'
    return href
# The link is raw HTML, so Streamlit must be told to render it unescaped
col2.markdown(filedownload(df_selected_coin), unsafe_allow_html=True)

# ---------------------------------#
# Preparing data for Bar plot of % Price change
col2.subheader("Table of % Price Change")
df_change = pd.concat(
    [
        df_coins.coin_symbol,
        df_coins.percent_change_1h,
        df_coins.percent_change_24h,
        df_coins.percent_change_7d,
    ],
    axis=1,
)
df_change = df_change.set_index("coin_symbol")
# Boolean helper columns, used below to pick the bar colour
df_change["positive_percent_change_1h"] = df_change["percent_change_1h"] > 0
df_change["positive_percent_change_24h"] = df_change["percent_change_24h"] > 0
df_change["positive_percent_change_7d"] = df_change["percent_change_7d"] > 0

col2.dataframe(df_change)

# Conditional creation of Bar plot (time frame)
col3.subheader("Bar plot of % Price Change")

# The time frames differ only in the column plotted and the caption shown;
# anything other than "7d" or "24h" falls through to the 1 hour view.
plot_settings = {
    "7d": ("percent_change_7d", "*7 days period*"),
    "24h": ("percent_change_24h", "*24 hour period*"),
}
change_column, caption = plot_settings.get(
    percent_timeframe, ("percent_change_1h", "*1 hour period*")
)

if sort_values == "Yes":
    df_change = df_change.sort_values(by=[change_column])
col3.write(caption)
fig = plt.figure(figsize=(5, 25))
plt.subplots_adjust(top=1, bottom=0)
# Green bars for gains, red bars for losses
df_change[change_column].plot(
    kind="barh",
    color=df_change["positive_" + change_column].map({True: "g", False: "r"}),
)
col3.pyplot(fig)
Classification Iris Data
Use scikit-learn to perform classification with a Random Forest Classifier.
Dependencies
- Pandas
pip install pandas
- Streamlit
pip install streamlit
- Scikit learn
pip install scikit-learn
replit: Classification_Iris_Data
import streamlit as st
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier

st.write("""
# Simple Iris Flower Prediction App
This app predicts the **Iris flower** type:
""")

st.sidebar.header("User Input Parameters")
def user_input_features():
    """Collect the four flower measurements from sidebar sliders.

    Returns:
        A one-row DataFrame (index 0) with the columns ``sepal_length``,
        ``sepal_width``, ``petal_length`` and ``petal_width`` holding the
        current slider values.
    """
    # Slider arguments are: label, minimum, maximum, initial value
    data = {
        'sepal_length': st.sidebar.slider('Sepal length', 4.3, 7.9, 5.4),
        'sepal_width': st.sidebar.slider('Sepal width', 2.0, 4.4, 3.4),
        'petal_length': st.sidebar.slider('Petal length', 1.0, 6.9, 1.3),
        'petal_width': st.sidebar.slider('Petal width', 0.1, 2.5, 0.2),
    }
    features = pd.DataFrame(data, index=[0])
    return features
df = user_input_features()

st.subheader('User Input parameters')
st.write(df)

iris = datasets.load_iris()
X = iris.data
Y = iris.target

clf = RandomForestClassifier()
clf.fit(X, Y)

# The model is fitted on a plain array, so predict on a plain array too;
# passing the named DataFrame triggers a feature-name warning in recent
# scikit-learn versions.
user_values = df.to_numpy()
prediction = clf.predict(user_values)
prediction_proba = clf.predict_proba(user_values)

st.subheader('Class labels and their corresponding index number')
st.write(iris.target_names)

st.subheader('Prediction')
# Translate the predicted class index into its species name
st.write(iris.target_names[prediction])
#st.write(prediction)

st.subheader('Prediction Probability')
st.write(prediction_proba)
Regression Boston Housing Data
Use regression to predict housing prices.
Dependencies
- Pandas
pip install pandas
- Streamlit
pip install streamlit
- Scikit learn
pip install scikit-learn
- shap
pip install shap
- Matplotlib
pip install matplotlib
replit: Regression_Boston_Housing_Data
import streamlit as st
import pandas as pd
import shap
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor

st.write("""
# Boston House Price Prediction App
This app predicts the **Boston House Price**!
""")
st.write('---')

# Loads the Boston House Price Dataset
# NOTE: datasets.load_boston() was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this script needs an older scikit-learn release to run.
boston = datasets.load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)
Y = pd.DataFrame(boston.target, columns=["MEDV"])

# Sidebar
# Header of Specify Input Parameters
st.sidebar.header('Specify Input Parameters')
def user_input_features():
    """Collect one value per housing feature from sidebar sliders.

    Each slider spans the minimum to the maximum of the matching column of
    the module-level ``X`` DataFrame and starts at that column's mean.

    Returns:
        A one-row DataFrame (index 0) with one column per feature, in the
        order listed in ``feature_names`` below.
    """
    feature_names = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    ]
    data = {
        name: st.sidebar.slider(
            name,
            float(X[name].min()),
            float(X[name].max()),
            float(X[name].mean()),
        )
        for name in feature_names
    }
    features = pd.DataFrame(data, index=[0])
    return features
df = user_input_features()

# Main Panel
# Print specified input parameters
st.header('Specified Input parameters')
st.write(df)
st.write('---')

# Build Regression Model
model = RandomForestRegressor()
model.fit(X, Y)
# Apply Model to Make Prediction
prediction = model.predict(df)

st.header('Prediction of MEDV')
st.write(prediction)
st.write('---')

# Explaining the model's predictions using SHAP values
# https://github.com/slundberg/shap
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

fig, ax = plt.subplots()

st.header('Feature Importance')
plt.title('Feature importance based on SHAP values')
shap.summary_plot(shap_values, X)
st.pyplot(fig, bbox_inches='tight')
st.write('---')

# Same SHAP values, drawn as a bar chart
plt.title('Feature importance based on SHAP values (Bar)')
shap.summary_plot(shap_values, X, plot_type="bar")
st.pyplot(fig, bbox_inches='tight')
Deploy App to Heroku
runtime.txt
- contains the required python version
python-3.7.9
requirements.txt
- contains the required packages and version numbers
streamlit==0.61.0
pandas==0.25.3
numpy==1.18.1
scikit-learn==0.22.1
setup.sh
- contains the setup steps for the server on the Heroku dyno
# Create the Streamlit config directory in the home directory (no error if it already exists).
# The echo below then writes a [server] section to config.toml; $PORT is expanded by the shell.
# NOTE(review): the literal \n sequences rely on the shell's echo interpreting backslash
# escapes - confirm for the shell that runs this script.
mkdir -p ~/.streamlit/
echo "\
[server]\n\
port = $PORT\n\
enableCORS = false\n\
headless = true\n\
\n\
" > ~/.streamlit/config.toml
Procfile
- runs the setup.sh file and starts the streamlit app
web: sh setup.sh && streamlit run app.py
Deploy App to Streamlit Sharing
requirements.txt
- contains the required packages and version numbers
streamlit==0.61.0
pandas==0.25.3
numpy==1.18.1
scikit-learn==0.22.1
References:
- I’m Christian Mills, a deep learning consultant specializing in computer vision and practical AI implementations.
- I help clients leverage cutting-edge AI technologies to solve real-world problems.
- Learn more about me or reach out via email at [email protected] to discuss your project.