In [49]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
In [50]:
crop = pd.read_csv('yield_df.csv')
In [51]:
crop.shape
Out[51]:
(28242, 8)
In [52]:
crop.describe
Out[52]:
<bound method NDFrame.describe of        Unnamed: 0      Area            Item  Year  hg/ha_yield  \
0               0   Albania           Maize  1990        36613   
1               1   Albania        Potatoes  1990        66667   
2               2   Albania     Rice, paddy  1990        23333   
3               3   Albania         Sorghum  1990        12500   
4               4   Albania        Soybeans  1990         7000   
...           ...       ...             ...   ...          ...   
28237       28237  Zimbabwe     Rice, paddy  2013        22581   
28238       28238  Zimbabwe         Sorghum  2013         3066   
28239       28239  Zimbabwe        Soybeans  2013        13142   
28240       28240  Zimbabwe  Sweet potatoes  2013        22222   
28241       28241  Zimbabwe           Wheat  2013        22888   

       average_rain_fall_mm_per_year  pesticides_tonnes  avg_temp  
0                             1485.0             121.00     16.37  
1                             1485.0             121.00     16.37  
2                             1485.0             121.00     16.37  
3                             1485.0             121.00     16.37  
4                             1485.0             121.00     16.37  
...                              ...                ...       ...  
28237                          657.0            2550.07     19.76  
28238                          657.0            2550.07     19.76  
28239                          657.0            2550.07     19.76  
28240                          657.0            2550.07     19.76  
28241                          657.0            2550.07     19.76  

[28242 rows x 8 columns]>
In [53]:
crop.columns.tolist()
Out[53]:
['Unnamed: 0',
 'Area',
 'Item',
 'Year',
 'hg/ha_yield',
 'average_rain_fall_mm_per_year',
 'pesticides_tonnes',
 'avg_temp']
In [54]:
crop.drop('Unnamed: 0', axis=1, inplace=True)
In [55]:
crop.isnull().sum()
Out[55]:
Area                             0
Item                             0
Year                             0
hg/ha_yield                      0
average_rain_fall_mm_per_year    0
pesticides_tonnes                0
avg_temp                         0
dtype: int64
In [56]:
crop.head(5)
Out[56]:
Area Item Year hg/ha_yield average_rain_fall_mm_per_year pesticides_tonnes avg_temp
0 Albania Maize 1990 36613 1485.0 121.0 16.37
1 Albania Potatoes 1990 66667 1485.0 121.0 16.37
2 Albania Rice, paddy 1990 23333 1485.0 121.0 16.37
3 Albania Sorghum 1990 12500 1485.0 121.0 16.37
4 Albania Soybeans 1990 7000 1485.0 121.0 16.37
In [57]:
crop['Year'].max()
Out[57]:
2013
In [58]:
area_dropdown = widgets.Dropdown(options=crop['Area'].unique(),description='Area:')
item_dropdown = widgets.Dropdown(options=crop['Item'].unique(),description='Item:')

def plotdata(area, item):
    plt.figure(figsize=(10,6))
    temp_crop = crop[(crop['Area'] == area) & (crop['Item'] == item)]
    plt.plot(temp_crop['Year'], temp_crop['hg/ha_yield'])
    plt.title(f'Yield(HG/HA) by year for {item} in {area}')
    plt.xlabel('Year')
    plt.ylabel("Yield(HG/HA)")
    plt.grid(True)
    plt.show()
    
widgets.interactive(plotdata, area=area_dropdown, item=item_dropdown)
In [59]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, confusion_matrix
In [60]:
X = crop.drop('hg/ha_yield', axis=1)
y = crop['hg/ha_yield']
In [61]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)
In [62]:
categorical_features = ['Area', 'Item']
numerical_features = ['Year', 'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)])

models = [
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Support Vector Machine', SVR())
]

for name, model, in models:
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f'{name}:')
    print(f'Mean Squared Error: {mse}')
    print(f'Mean Absolute Error: {mae}')
    print(f'R^2 Score: {r2}\n')
Linear Regression:
Mean Squared Error: 1755478293.3023083
Mean Absolute Error: 29382.759649212712
R^2 Score: 0.7597634459557141

Decision Tree:
Mean Squared Error: 150410177.4015012
Mean Absolute Error: 3746.1767455034696
R^2 Score: 0.9794164229486694

Random Forest:
Mean Squared Error: 93029217.64155686
Mean Absolute Error: 3561.606054383232
R^2 Score: 0.9872689860325178

Support Vector Machine:
Mean Squared Error: 8811737554.637598
Mean Absolute Error: 57552.20392982162
R^2 Score: -0.20588301965646139

In [ ]: