import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import sklearn.linear_model
import sklearn.metrics
import sklearn.preprocessing
import sklearn.pipeline
import sklearn.datasets

import warnings
warnings.filterwarnings("ignore")

sns.set()


df = pd.read_csv("../Project_2/walmart_stock.csv")

num_rows, num_cols = df.shape

print("Number of rows: ", num_rows)
print("Number of columns: ", num_cols)

Number of rows:  2829
Number of columns:  6


print(df.head(),"\n")

summary = df.describe()

print(summary)

       Date     Close      Open      High       Low      Volume
0  3/1/2012   $60.33    $59.97    $61.06    $59.87   12,668,774
1  4/1/2012   $59.71    $60.21    $60.35    $59.47    9,593,915
2  5/1/2012   $59.42    $59.35    $59.62    $58.37   12,768,202
3  6/1/2012   $59.00    $59.42    $59.45    $58.87    8,069,504
4  9/1/2012   $59.18    $59.03    $59.55    $58.92    6,679,713 

            Date     Close      Open      High       Low      Volume
count       2829      2829      2829      2829      2829        2829
unique      2829      2279      2240      2262      2257        2829
top     3/1/2012   $73.51    $74.84    $69.24    $74.51   12,668,774
freq           1         5         5         6         6           1


data_types = df.dtypes

print(data_types)

Date      object
Close     object
Open      object
High      object
Low       object
Volume    object
dtype: object


# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Remove dollar sign and extra spaces from 'Close' column
df['Close'] = df['Close'].str.replace('$', '').str.strip()
df['Open'] = df['Open'].str.replace('$', '').str.strip()
df['High'] = df['High'].str.replace('$', '').str.strip()
df['Low'] = df['Low'].str.replace('$', '').str.strip()

# Remove commas from 'Volume' column
df['Volume'] = df['Volume'].str.replace(',', '')

# Convert the other columns to float format
df['Close'] = df['Close'].astype(float)
df['Open'] = df['Open'].astype(float)
df['High'] = df['High'].astype(float)
df['Low'] = df['Low'].astype(float)
df['Volume'] = df['Volume'].astype(float)

# Check the data types of the columns
print(df.dtypes)

Date      datetime64[ns]
Close            float64
Open             float64
High             float64
Low              float64
Volume           float64
dtype: object


df.head()


%matplotlib inline

# Set the figure size to (10, 5)
plt.figure(figsize=(10, 5))

# Create a scatter plot
plt.scatter(x=df["Date"], y=df["Close"])

# Add labels to the x and y axis
plt.xlabel("Date")
plt.ylabel("Close")
plt.title("Walmart Daily Stock Price by Year")

# Show the plot
plt.show()


from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
model

LinearRegression()

LinearRegression()


x=df["Date"] 
y=df["Close"]

print("Shape of x: {}".format(x.shape))
print("Shape of y: {}".format(y.shape))

Shape of x: (2829,)
Shape of y: (2829,)


x = x[:,np.newaxis]
print("Shape of x: {}".format(x.shape))
print(x[:5])

Shape of x: (2829, 1)
[['2012-03-01T00:00:00.000000000']
 ['2012-04-01T00:00:00.000000000']
 ['2012-05-01T00:00:00.000000000']
 ['2012-06-01T00:00:00.000000000']
 ['2012-09-01T00:00:00.000000000']]


model.fit(x, y)

print(model.intercept_)
print(model.coef_)

-281.12770289450333
[2.51298187e-16]


yfit = model.predict(x.astype(float))

plt.scatter(x, y);
plt.plot(x, yfit, color='red');


from sklearn.metrics import r2_score
r2_score(y_true=y, y_pred=yfit)

0.7885782082995971


def adj_r2(ytrue, ypred, N, p):
    return 1- ((1-r2_score(ytrue, ypred))*(N - 1))/(N - p - 1)

adj_r2(y, yfit, N=len(y),p=x.shape[1])

0.7885034216735977


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load the data into a pandas DataFrame
data = pd.read_csv("../Project_2/walmart_stock.csv")

# Remove dollar sign and extra spaces from 'Close' column
data['Close'] = data['Close'].str.replace('$', '').str.strip()

# Convert the other columns to float format
data['Close'] = data['Close'].astype(float)

# Convert the 'Date' column to a datetime format
data['new_Date'] = pd.to_datetime(data['Date'])

# Set the 'Date' column as the index
data.set_index('new_Date', inplace=True)

# Split the data into training and testing sets
X = data.index.astype(int).values.reshape(-1,1)
y = data['Close'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
reg = LinearRegression()

# Fit the model to the training data
reg.fit(X_train, y_train)

# Predict the test data using the model
y_pred = reg.predict(X_test)

# Evaluate the model using the R-squared value
r2_score = reg.score(X_test, y_test)
print('R-squared:', r2_score)

#Input a future date to predict the stock price
future_date = '2023-04-10'
future_date_int = pd.to_datetime(future_date).value

#Predict the stock price using the model
future_price = reg.predict([[future_date_int]])

print(f"The predicted stock price for {future_date} is ${future_price[0]:,.2f}")

R-squared: 0.786880531024917
The predicted stock price for 2023-04-10 is $141.20


import matplotlib.pyplot as plt

# Create a scatter plot of the test data and predicted values
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)

# Add labels and title to the plot
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.title('Linear Regression Model')

# Show the plot
plt.show()

Project 2 - Machine Learning Stock Price Prediction¶

Time Series Analysis¶

	Date	Close	Open	High	Low	Volume
0	2012-03-01	60.33	59.97	61.06	59.87	12668774.0
1	2012-04-01	59.71	60.21	60.35	59.47	9593915.0
2	2012-05-01	59.42	59.35	59.62	58.37	12768202.0
3	2012-06-01	59.00	59.42	59.45	58.87	8069504.0
4	2012-09-01	59.18	59.03	59.55	58.92	6679713.0