Upload 9 files
Browse files- app.py +89 -0
- datasets/cleaned_housePrice.csv +0 -0
- datasets/housePrice.csv +0 -0
- models/GradientBoostingRegressor_pipeline.joblib +3 -0
- models/KernelRidge_pipeline.joblib +3 -0
- models/XGBoostRegressor_pipeline.joblib +3 -0
- models/tehran_house_price_preprocessor.joblib +3 -0
- requirements.txt +0 -0
- utils.py +83 -0
app.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import joblib
|
4 |
+
from sklearn.metrics import r2_score
|
5 |
+
from utils import data_cleaning
|
6 |
+
|
7 |
+
# Load Dataset and Models.
# NOTE(review): the original used absolute Windows paths
# (C:/Users/Afshin/Desktop/...), which break as soon as the app is deployed
# or the repo is cloned elsewhere. The dataset and models ship alongside
# app.py in this repo, so relative paths are the portable fix.
DATASET_PATH = "datasets/housePrice.csv"

# Pipelines evaluated by the app. Only models actually present in the
# repo's models/ directory are listed: the original also referenced a
# CatBoostRegressor pipeline that was never uploaded, so joblib.load
# failed on every prediction.
MODELS_PATH = [
    "models/KernelRidge_pipeline.joblib",
    # "models/GradientBoostingRegressor_pipeline.joblib",  # kept disabled, as in the original
    "models/XGBoostRegressor_pipeline.joblib",
]
|
15 |
+
|
16 |
+
# Load the cleaned data (data_cleaning caches the cleaned CSV on disk,
# so repeated Streamlit reruns are cheap after the first one).
df = data_cleaning(DATASET_PATH)

# Prepare data for input fields.
# Cast the area bounds to plain Python ints: df['Area'] yields numpy
# scalars, and st.number_input raises a type-mismatch error when
# min_value/max_value/value/step are not all the same Python numeric type
# (step below is the int 10).
min_area, max_area = int(df['Area'].min()), int(df['Area'].max())
rooms = df['Room'].unique().tolist()
addresses = df['Address'].unique().tolist()

# Streamlit app layout
st.title("🏠 Tehran House Price Prediction")
st.sidebar.header("Input Parameters")

# Sidebar input widgets
st.sidebar.subheader("Enter the details:")
area = st.sidebar.number_input("Area (m²)", min_value=min_area, max_value=max_area, value=min_area, step=10)
room = st.sidebar.selectbox("Room", options=rooms)
parking = st.sidebar.checkbox("Parking", value=True)
warehouse = st.sidebar.checkbox("Warehouse", value=True)
elevator = st.sidebar.checkbox("Elevator", value=True)
address = st.sidebar.selectbox("Address", options=addresses)

# Single-row frame with the same feature columns as the training data
# (df minus 'Price'), which is the input layout the pipelines expect.
sample = pd.DataFrame({
    'Area': [area],
    'Room': [room],
    'Parking': [parking],
    'Warehouse': [warehouse],
    'Elevator': [elevator],
    'Address': [address]
})
|
46 |
+
|
47 |
+
def load_and_predict(sample):
    """Load each pipeline in MODELS_PATH, score it on the cleaned dataset,
    and predict the price of `sample`.

    Parameters
    ----------
    sample : pd.DataFrame
        Single-row frame with the same feature columns as the training data.

    Returns
    -------
    pd.DataFrame or None
        One row per usable model (name, R2, predicted price in IRR),
        sorted by R2 descending; None when no model could be loaded.
    """
    result = {
        'Model': [],
        'R2': [],
        'Predicted_Price_(IRR)': []
    }

    # Define features and target variable.
    # NOTE(review): R2 here is computed on the same data the models were
    # trained on, so it is an optimistic (training) score — TODO confirm
    # whether a held-out split is available.
    X = df.drop(columns=['Price'])  # Features
    y = df['Price']

    for path in MODELS_PATH:
        model_name = path.split('/')[-1].split('_')[0]
        # A missing or broken model file should not abort the whole app:
        # warn and keep evaluating the remaining models. (The original
        # returned None on the first failure, hiding the working models.)
        try:
            model = joblib.load(path)  # Load the model once
            y_pred = model.predict(X)
            price_pred = model.predict(sample)[0]
        except Exception as e:
            st.warning(f"Skipping {model_name}: {str(e)}")
            continue

        result['Model'].append(model_name)
        result['R2'].append(r2_score(y, y_pred))
        result['Predicted_Price_(IRR)'].append(price_pred)

    if not result['Model']:
        st.error("An error occurred during model loading or prediction: no model could be loaded.")
        return None
    return pd.DataFrame(result).sort_values(by=['R2'], ascending=False)
|
75 |
+
|
76 |
+
# Predict button.
# Streamlit re-executes the whole script on every widget interaction, so
# gating the (expensive) model loading/prediction on the button click keeps
# ordinary input changes cheap.
if st.sidebar.button("Predict"):
    result_df = load_and_predict(sample)

    # load_and_predict returns None on failure (and has already rendered an
    # error message itself), so only display results on success.
    if result_df is not None:
        st.success('Predicted House Price:')
        st.table(result_df)

# Footer or additional information
st.sidebar.markdown("### About this App")
st.sidebar.markdown(
    "This app predicts house prices based on input features such as area, number of rooms, "
    "and facilities like parking, warehouse, and elevator. Please fill in all fields to get the prediction."
)
|
datasets/cleaned_housePrice.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/housePrice.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/GradientBoostingRegressor_pipeline.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b4036d414847c75c050430594d81a1e429dd2c6211fa6b2645902856a462a6b
|
3 |
+
size 3176595
|
models/KernelRidge_pipeline.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50686c1e622bbf7c4b9c7750673ebfb3a48f3048589979defcdc20602f7655ed
|
3 |
+
size 187296
|
models/XGBoostRegressor_pipeline.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf25ca293e9d2eed0fd9a997bcd15a94556c77b6e70b441e7f261f16a8f5730f
|
3 |
+
size 1174097
|
models/tehran_house_price_preprocessor.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1f7f26dd9a0ec6db4d9fd4e039f5171ebad96c666d2c1ed34c87949ebb1a2db9
|
3 |
+
size 6273
|
requirements.txt
ADDED
Binary file (144 Bytes). View file
|
|
utils.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re, os
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
import pandas as pd
|
4 |
+
import requests
|
5 |
+
|
6 |
+
def convert_to_persian_numbers(text: str) -> str:
    """Convert Western (ASCII) digits in *text* to Persian digits.

    Non-digit characters are left untouched.

    Parameters
    ----------
    text : str
        Input string, possibly containing ASCII digits 0-9.

    Returns
    -------
    str
        The same string with every ASCII digit replaced by its Persian
        counterpart (۰-۹).
    """
    # str.translate performs the whole substitution in a single C-level
    # pass instead of ten chained str.replace calls.
    return text.translate(str.maketrans('0123456789', '۰۱۲۳۴۵۶۷۸۹'))
|
26 |
+
|
27 |
+
def get_USD_to_IR() -> float:
    """Scrape the current USD -> Iranian Rial rate from tgju.org.

    Returns
    -------
    float
        Rials per USD. Falls back to 300000.0 (the historical reference
        rate of 30,000 Toman/USD used elsewhere in this module) whenever
        the page cannot be fetched or parsed.
    """
    url = 'https://www.tgju.org/profile/price_dollar_rl'
    fallback = 300000.0
    try:
        # A timeout keeps the caller (a Streamlit app) from hanging
        # indefinitely when the site is unreachable; the original had none
        # and let connection errors propagate uncaught.
        response = requests.get(url, timeout=10)
    except requests.RequestException as e:
        print(f'> Error in fetching {url}: {e}.')
        return fallback
    if response.status_code != 200:
        print(f'> Error in fetching {url}: {response.status_code}.')
        return fallback
    soup = BeautifulSoup(response.text, 'html.parser')
    node = soup.find('span', {'data-col': 'info.last_trade.PDrCotVal'})
    if node is None:
        # Page layout changed: the price element is gone — do not crash
        # with an AttributeError as the original would have.
        print(f'> Error in parsing {url}: price element not found.')
        return fallback
    per_usd = node.text.replace(',', '')
    # float() accepts Unicode decimal digits, so the value parses whether
    # the site serves ASCII or Persian numerals after conversion.
    return float(convert_to_persian_numbers(per_usd))
|
36 |
+
|
37 |
+
def remove_outliers_iqr(data: pd.DataFrame, column_name: str, threshold: float = 1.5) -> pd.DataFrame:
    """Drop rows of *data* whose *column_name* value falls outside the
    Tukey fences: [Q1 - threshold*IQR, Q3 + threshold*IQR]."""
    col = data[column_name]
    first_quartile = col.quantile(0.25)
    third_quartile = col.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_fence = first_quartile - threshold * spread
    upper_fence = third_quartile + threshold * spread
    # Rows flagged True are outliers; keep the complement. NaN values are
    # never flagged by either comparison, so they survive the filter —
    # matching the original mask-negation semantics.
    is_outlier = (col < lower_fence) | (col > upper_fence)
    return data[~is_outlier]
|
43 |
+
|
44 |
+
def data_cleaning(path: str) -> pd.DataFrame:
    """Load the raw Tehran housing CSV at *path*, clean it and return it.

    The cleaned frame is cached next to the raw file as
    ``cleaned_housePrice.csv``; subsequent calls load the cache directly.

    Parameters
    ----------
    path : str
        Path (with '/' separators) to the raw housePrice.csv file.

    Returns
    -------
    pd.DataFrame
        Cleaned dataset: numeric 'Area', rial-denominated 'Price',
        no NaNs, no duplicates, IQR outliers removed.
    """
    # Step 1: reuse the cached cleaned file when it exists.
    cleaned_data_path = path.rsplit('/', 1)[0] + '/cleaned_housePrice.csv'
    if os.path.exists(cleaned_data_path):
        return pd.read_csv(cleaned_data_path)

    df = pd.read_csv(path)

    # Update Price: the CSV prices correspond to a historical rate of
    # 300,000 Rial (30,000 Toman) per USD; rescale them to today's rate.
    # The trailing *10 converts Toman to Rial.
    today_usd = get_USD_to_IR()
    correct_coeff = today_usd / 300000
    df.Price = df.Price * correct_coeff * 10

    # Drop the redundant USD price column.
    df = df.drop(['Price(USD)'], axis=1)

    # Step 2: data cleaning.
    # Coerce 'Area' to numeric, stripping any non-digit characters first.
    df.Area = df.Area.apply(lambda x: re.sub(r'\D', '', str(x)))
    df.Area = pd.to_numeric(df.Area, errors='coerce')

    # Handle missing values and duplicates.
    df.dropna(ignore_index=True, inplace=True)
    df = df.drop_duplicates(ignore_index=True)

    # Handle outliers.
    df = remove_outliers_iqr(df, 'Price')
    df = remove_outliers_iqr(df, 'Area')
    df.reset_index(drop=True, inplace=True)

    # Save the cleaned dataset to the exact path checked above.
    # BUG FIX: the original concatenated directory + filename without a
    # '/' (e.g. 'datasetscleaned_housePrice.csv'), writing the cache to the
    # wrong location so the os.path.exists check above never hit.
    df.to_csv(cleaned_data_path, index=False)

    return df
|
83 |
+
|