Hybrid Ticket Classification & Priority Prediction System is a machine learning and NLP-based solution designed to automate IT helpdesk processes. The system intelligently analyzes support ticket descriptions, employee performance data, and operational metrics to predict key outcomes such as ticket priority, category, and expected resolution behavior.

This project integrates textual ticket data (NLP) with structured employee/ticket features, creating a hybrid dataset that mirrors a real IT support environment. Using techniques like TF-IDF vectorization, label encoding, feature scaling, and Random Forest models (a multi-output classifier plus a regressor), the system learns patterns in how tickets are handled and predicts outcomes with high accuracy.

The goal of the system is to help organizations automate manual support tasks, reduce response delays, and improve customer satisfaction by enabling data-driven decision-making.


🔑 Key Highlights

Combines NLP + tabular ML to make hybrid predictions

Performs ticket classification based on description

Predicts ticket priority levels for faster resolution

Includes employee experience, rating, and resolution time as predictive features

Uses Random Forest Classifier for robust and interpretable results

End-to-end workflow: EDA → Preprocessing → Model Training → Prediction


🎯 Use Cases

IT Helpdesk Automation

Customer Support Ticket Routing

SLA (Service Level Agreement) Optimization

Workforce Management

Productivity Analysis

Code block example

import numpy as np
import pandas as pd
# Load the raw customer-support ticket export into a DataFrame.
dataset = pd.read_csv(
    filepath_or_buffer=r"C:\Users\pintu\Downloads\customer_support_tickets.csv",
)
# Create the employee dataset: generate random values with NumPy (50 employees as an example)
# Synthetic employee roster used to enrich the ticket data.
# The values are drawn from a seeded generator so the roster -- and therefore
# the saved hybrid dataset and every model trained on it -- is identical from
# run to run, in line with the random_state=42 used for the train/test split
# and the Random Forest models.
num_employees = 50
employee_rng = np.random.default_rng(42)
employee_data = pd.DataFrame({
    'Employee_ID': range(1, num_employees + 1),
    'Department': employee_rng.choice(['IT Support', 'Network', 'Software', 'Hardware'], num_employees),
    'Experience_Years': employee_rng.integers(1, 15, num_employees),       # upper bound exclusive: 1-14
    'Avg_Resolution_Time': employee_rng.integers(10, 120, num_employees),  # (Time in minutes) 10-119
    'Tickets_Handled': employee_rng.integers(50, 300, num_employees),      # 50-299
    'Customer_Rating_Avg': employee_rng.uniform(2.5, 5.0, num_employees).round(2),
})
# Attach the assigned employee's attributes to every ticket (the hybrid dataset).
# how='left' keeps every ticket row; a ticket whose Assigned_Employee_ID has no
# match in employee_data gets NaN in the employee columns.
hybrid_dataset = dataset.merge(employee_data,
                               left_on='Assigned_Employee_ID',
                               right_on='Employee_ID',
                               how='left')

print("Hybrid Dataset Shape:", hybrid_dataset.shape)
# Save the final hybrid dataset. The path lives in one variable so that the
# confirmation message always names the file that was actually written.
hybrid_dataset_path = r"C:\Users\pintu\OneDrive\Desktop\HCL\final_hybrid_ticket_employee_dataset1.csv"
hybrid_dataset.to_csv(hybrid_dataset_path, index=False)
print(f"✅ Hybrid dataset ready and saved as {hybrid_dataset_path}")
from sklearn.preprocessing import LabelEncoder,StandardScaler
# Work on a copy of the merged data so the label/target columns added below
# do not modify hybrid_dataset itself.
new_dataset = hybrid_dataset.copy()
# Encode the two classification targets as integer labels. The fitted encoders
# are kept so predicted labels can be mapped back to their original names.
le_dept = LabelEncoder()
le_priority = LabelEncoder()
new_dataset['Department_Label'] = le_dept.fit_transform(new_dataset['Department'])
new_dataset['Priority_Label'] = le_priority.fit_transform(new_dataset['Ticket Priority'])
# Regression target: the assigned employee's average resolution time stands in
# for the ticket's resolution time.
# NOTE(review): Avg_Resolution_Time is also used as a numerical input feature,
# so the regressor is given its own target -- confirm whether a real per-ticket
# resolution time should be used here instead.
new_dataset['Resolution_Time'] = new_dataset['Avg_Resolution_Time']
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): the vectorizer and the scaler below are fit on the full dataset,
# i.e. before the train/test split -- confirm this is acceptable for evaluation.

# Text: ticket descriptions -> TF-IDF matrix (at most 5000 terms, English stop words dropped).
text_feature = new_dataset['Ticket Description']
X_text = text_feature
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_text_vect = tfidf.fit_transform(X_text)

# Numerical: employee metrics, standardised with StandardScaler.
numerical_features = new_dataset[
    ['Experience_Years', 'Avg_Resolution_Time', 'Tickets_Handled', 'Customer_Rating_Avg']
]
X_num = numerical_features.values
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

# Categorical: one indicator column per assigned employee id ("Emp_<id>").
categorical_features = pd.get_dummies(new_dataset['Assigned_Employee_ID'], prefix='Emp')
X_cat = categorical_features.values

# Targets: two label columns for classification, one numeric column for regression.
y_class = new_dataset[['Department_Label', 'Priority_Label']]
y_reg = new_dataset['Resolution_Time'].values

# Build the full feature matrix and hold out 20% of the rows for testing.
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack

# Column-wise stack: TF-IDF text | scaled numerics | one-hot employee id.
X_full = hstack([X_text_vect, X_num_scaled, X_cat])

# One call splits features and both targets, so their rows stay aligned.
(X_train, X_test,
 y_train_class, y_test_class,
 y_train_reg, y_test_reg) = train_test_split(
    X_full, y_class, y_reg,
    random_state=42,
    test_size=0.2,
)
# Models: a multi-output Random Forest classifier and a Random Forest regressor.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.multioutput import MultiOutputClassifier

# Base classifier shared by both classification targets.
clf = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    max_features='sqrt',
    n_estimators=200,
    max_depth=15,
    min_samples_leaf=5,
    min_samples_split=10,
)
# Wrap the base classifier so it handles the two-column target
# (Department_Label, Priority_Label); fit() returns the fitted estimator.
multi_clf = MultiOutputClassifier(clf, n_jobs=-1).fit(X_train, y_train_class)

# Regression model for resolution time.
reg_model = RandomForestRegressor(random_state=42, n_estimators=200).fit(X_train, y_train_reg)
# Evaluation on the held-out test split.
y_pred_class = multi_clf.predict(X_test)
# NOTE(review): y_pred_reg is computed but no regression metric is reported
# below, although mean_squared_error is imported -- confirm whether one is wanted.
y_pred_reg = reg_model.predict(X_test)
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
# Per-target accuracy: column 0 of the predictions is the department label,
# column 1 the priority label (same order as the y_class columns).
print(" Department & Priority Prediction ")
print("Department Accuracy:",accuracy_score(y_test_class['Department_Label'], y_pred_class[:,0]))
print("Priority Accuracy:", accuracy_score(y_test_class['Priority_Label'], y_pred_class[:,1]))
# hstack is used by the single-ticket prediction helper that follows.
# (scipy.sparse has no `hstacka`; importing that name raises ImportError.)
from scipy.sparse import hstack
# function for Single Ticket prediction
def predict_ticket(ticket_description, employee_id, experience, avg_time, tickets_handled, rating):
    """Predict department, priority and resolution time for one ticket.

    Builds a single feature row in the same layout as the training matrix
    (TF-IDF text | scaled numerics | one-hot employee id) using the
    module-level fitted ``tfidf``, ``scaler`` and ``categorical_features``,
    then queries ``multi_clf`` and ``reg_model``.

    Args:
        ticket_description: Free-text description of the ticket.
        employee_id: Id of the assigned employee. If no ``Emp_<employee_id>``
            column exists in ``categorical_features``, the one-hot part of the
            row is all zeros.
        experience: Passed to the scaler as the first numerical feature.
        avg_time: Passed to the scaler as the second numerical feature.
        tickets_handled: Passed to the scaler as the third numerical feature.
        rating: Passed to the scaler as the fourth numerical feature.

    Returns:
        A dict with the keys ``Predicted_Department`` and
        ``Predicted_Priority`` (decoded back through the label encoders) and
        ``Predicted_Resolution_Time`` (rounded to two decimals).
    """
    # Text and numerical parts go through the already-fitted transformers.
    description_vec = tfidf.transform([ticket_description])
    numeric_row = scaler.transform([[experience, avg_time, tickets_handled, rating]])

    # One-hot row aligned with the training columns: 1 where the column name
    # matches this employee, 0 everywhere else.
    employee_column = f'Emp_{employee_id}'
    employee_one_hot = (
        (categorical_features.columns == employee_column)
        .astype('int64')
        .reshape(1, -1)
    )

    features = hstack([description_vec, numeric_row, employee_one_hot])

    # Single input row, so take row 0 of each prediction.
    predicted_labels = multi_clf.predict(features)[0]
    resolution_time = reg_model.predict(features)[0]

    return {
        "Predicted_Department": le_dept.inverse_transform([predicted_labels[0]])[0],
        "Predicted_Priority": le_priority.inverse_transform([predicted_labels[1]])[0],
        "Predicted_Resolution_Time": round(resolution_time, 2),
    }


# Example: run the single-ticket helper on a hand-written VPN ticket.
sample_ticket = predict_ticket(
    ticket_description="User cannot connect to VPN after password reset",
    employee_id=45,
    experience=3,
    avg_time=45,
    tickets_handled=456,
    rating=4.0,
)
print("\nSample Ticket Prediction:\n", sample_ticket)













Share this project:

Updates