import os
import re

import hopsworks
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# The API key comes from the environment so it is never stored in the notebook.
api_key = os.environ.get("HOPSWORKS_API_KEY")
project = hopsworks.login(api_key_value=api_key, project="orestavf")

# Feature store handle of the logged-in project.
fs = project.get_feature_store()

# Retrieve version 1 of the "job_feedback" feature group ...
feedback_fg = fs.get_feature_group(version=1, name="job_feedback")

# ... and read its contents into `feedback_df`.
feedback_df = feedback_fg.read()
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
job_idresume_textjob_headlinejob_occupationjob_descriptionis_relevant
029321628Filip Orestav \\nTransformatorvägen 6, Sollent...Junior Projektadmin till talangprogram på AFRY...Projektledare, bygg och anläggningVill du kickstarta din karriär hos en av Sveri...True
# Columns to preprocess
columns_to_process = ['resume_text', 'job_headline', 'job_occupation', 'job_description']


# Define preprocessing functions
def preprocess_text(text):
    """Normalize one free-text value for downstream matching.

    Strings are lowercased, stripped of punctuation/symbols (including
    underscores), and have every run of whitespace collapsed to a single
    space with leading/trailing whitespace removed.

    Letters and digits are kept regardless of alphabet, so words such as
    "idé" or "café" are preserved rather than truncated to "id" / "caf".

    Args:
        text: The value to clean. Only ``str`` instances are processed.

    Returns:
        The cleaned string, or ``text`` itself, unchanged, when it is not a
        ``str`` (e.g. ``None`` or NaN in a DataFrame cell).
    """
    if not isinstance(text, str):
        return text

    text = text.lower()
    # Remove special characters. `\w` is Unicode-aware for str patterns, so
    # all letters and digits survive; `_` is listed explicitly because `\w`
    # would otherwise keep it.
    text = re.sub(r"[^\w\s]|_", "", text)
    # Collapse runs of whitespace (spaces, tabs, newlines) into one space.
    text = re.sub(r"\s+", " ", text)
    return text.strip()
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
job_idresume_textjob_headlinejob_occupationjob_descriptionis_relevant
029321628filip orestav transformatorvägen 6 sollentuna ...junior projektadmin till talangprogram på afry...projektledare bygg och anläggningvill du kickstarta din karriär hos en av sveri...True
# Apply preprocessing
# Each selected column is cleaned element-wise with Series.map. This replaces
# DataFrame.applymap, which is deprecated and emitted a FutureWarning here.
# The cleaned values overwrite the originals in `feedback_df`, as before.
feedback_df[columns_to_process] = feedback_df[columns_to_process].apply(
    lambda column: column.map(preprocess_text)
)

# Display processed dataframe
feedback_df.head()