{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import hopsworks\n",
"import os\n",
"import re\n",
"from dotenv import load_dotenv"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2025-01-08 19:51:38,754 INFO: Closing external client and cleaning up certificates.\n",
"Connection closed.\n",
"2025-01-08 19:51:38,758 INFO: Initializing external client\n",
"2025-01-08 19:51:38,758 INFO: Base URL: https://c.app.hopsworks.ai:443\n",
"2025-01-08 19:51:39,828 INFO: Python Engine initialized.\n",
"\n",
"Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1158296\n"
]
}
],
"source": [
"# Load variables from a local .env file into the process environment.\n",
"load_dotenv()\n",
"\n",
"# The API key is read from the environment rather than written in the notebook.\n",
"api_key = os.environ.get(\"HOPSWORKS_API_KEY\")\n",
"project = hopsworks.login(\n",
"    project=\"orestavf\",\n",
"    api_key_value=api_key,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Get the feature store object of the logged-in project.\n",
"fs = project.get_feature_store()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Retrieve version 1 of the \"job_feedback\" feature group\n",
"feedback_fg = fs.get_feature_group(name=\"job_feedback\", version=1)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.93s) \n"
]
}
],
"source": [
"# Read the feature group's rows into feedback_df (fetched from Hopsworks, per the log output).\n",
"feedback_df = feedback_fg.read()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" job_id | \n",
" resume_text | \n",
" job_headline | \n",
" job_occupation | \n",
" job_description | \n",
" is_relevant | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 29321628 | \n",
" Filip Orestav \\nTransformatorvägen 6, Sollent... | \n",
" Junior Projektadmin till talangprogram på AFRY... | \n",
" Projektledare, bygg och anläggning | \n",
" Vill du kickstarta din karriär hos en av Sveri... | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" job_id resume_text \\\n",
"0 29321628 Filip Orestav \\nTransformatorvägen 6, Sollent... \n",
"\n",
" job_headline \\\n",
"0 Junior Projektadmin till talangprogram på AFRY... \n",
"\n",
" job_occupation \\\n",
"0 Projektledare, bygg och anläggning \n",
"\n",
" job_description is_relevant \n",
"0 Vill du kickstarta din karriär hos en av Sveri... True "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows before any cleaning is applied.\n",
"# NOTE(review): resume_text holds personal data (name, street address);\n",
"# clear this cell's output before sharing or committing the notebook.\n",
"feedback_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Free-text columns to normalize; job_id and is_relevant are not listed.\n",
"columns_to_process = [\n",
"    \"resume_text\",\n",
"    \"job_headline\",\n",
"    \"job_occupation\",\n",
"    \"job_description\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Define preprocessing functions\n",
"def preprocess_text(text):\n",
"    \"\"\"Normalize one free-text value.\n",
"\n",
"    Lowercases the text, removes punctuation and symbols, collapses each run\n",
"    of whitespace into a single space, and strips both ends.\n",
"\n",
"    Args:\n",
"        text: Value to clean. Anything that is not a ``str`` (for example\n",
"            ``None`` or ``NaN``) is returned unchanged.\n",
"\n",
"    Returns:\n",
"        The cleaned string, or ``text`` itself when it is not a ``str``.\n",
"    \"\"\"\n",
"    if not isinstance(text, str):\n",
"        return text\n",
"\n",
"    lowered = text.lower()\n",
"    # Keep every Unicode letter and digit rather than only a-z/åäö/0-9, so\n",
"    # words such as \"idé\" or \"müller\" are not truncated. The underscore is a\n",
"    # word character for the regex engine, so it is removed explicitly.\n",
"    without_symbols = re.sub(r\"[^\\w\\s]|_\", \"\", lowered)\n",
"    # Collapse whitespace runs (spaces, tabs, newlines) to one space.\n",
"    return re.sub(r\"\\s+\", \" \", without_symbols).strip()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2025-01-08 18:38:35,968 WARNING: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" job_id | \n",
" resume_text | \n",
" job_headline | \n",
" job_occupation | \n",
" job_description | \n",
" is_relevant | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 29321628 | \n",
" filip orestav transformatorvägen 6 sollentuna ... | \n",
" junior projektadmin till talangprogram på afry... | \n",
" projektledare bygg och anläggning | \n",
" vill du kickstarta din karriär hos en av sveri... | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" job_id resume_text \\\n",
"0 29321628 filip orestav transformatorvägen 6 sollentuna ... \n",
"\n",
" job_headline \\\n",
"0 junior projektadmin till talangprogram på afry... \n",
"\n",
" job_occupation \\\n",
"0 projektledare bygg och anläggning \n",
"\n",
" job_description is_relevant \n",
"0 vill du kickstarta din karriär hos en av sveri... True "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Apply preprocessing to every value of the selected text columns.\n",
"# Series.map is used per column because DataFrame.applymap is deprecated\n",
"# (pandas emits a FutureWarning pointing to DataFrame.map).\n",
"feedback_df[columns_to_process] = feedback_df[columns_to_process].apply(\n",
"    lambda column: column.map(preprocess_text)\n",
")\n",
"\n",
"# Display processed dataframe\n",
"feedback_df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}