% Journal article: Glebova & Degtyarev (2024), Organizational Psychology 14(2).
% The annote field intentionally mirrors the abstract (as in the original export)
% so that annotated-bibliography styles still print it.
@ARTICLE{33704756_941837579_2024,
  author   = {Glebova, Darya and Degtyarev, Pavel},
  title    = {The universalization of categories and process automation for content analysis in engagement surveys with open questions (in {Russian})},
  journal  = {Organizational Psychology},
  year     = {2024},
  volume   = {14},
  number   = {2},
  pages    = {80--111},
  url      = {https://orgpsyjournal.hse.ru/en/2024-14-2/941837579.html},
  keywords = {employee engagement, employee engagement drivers, content-analysis automation, category universalization, natural language processing, multilabel classification},
  abstract = {This paper aims to develop a solution for content analysis automation for answers to open-ended questions in engagement studies. Responses to two open questions by more than 16,000 employees of five Russian companies were processed using a natural language processing method and a multilabel classification. The categories used for classification were obtained by building a co-occurrence matrix from the results of manual content analysis. The solution contains two separate models developed with supervised machine learning algorithms — random forest and gradient boosting. These models automate the content analysis process for answers to open questions in engagement studies with the accuracy of .86 and .70 respectively. Aggregated and potentially universal categories of engagement studies were identified, the categories being independent of the context of data collection. They also were compared with the engagement factors defined by other authors. Besides, the most important speech patterns (words and phrases) that predict to which of the categories a particular employee’s response would be assigned were named. The solution can be used by companies for benchmarking factors of engagement in different domain areas. Research opportunities for the categories as regards their possible internal relationship are discussed. Improvement of the tool is also considered, as well as adding topic modeling and sentiment analysis methods to refine its predictive power.},
  annote   = {This paper aims to develop a solution for content analysis automation for answers to open-ended questions in engagement studies. Responses to two open questions by more than 16,000 employees of five Russian companies were processed using a natural language processing method and a multilabel classification. The categories used for classification were obtained by building a co-occurrence matrix from the results of manual content analysis. The solution contains two separate models developed with supervised machine learning algorithms — random forest and gradient boosting. These models automate the content analysis process for answers to open questions in engagement studies with the accuracy of .86 and .70 respectively. Aggregated and potentially universal categories of engagement studies were identified, the categories being independent of the context of data collection. They also were compared with the engagement factors defined by other authors. Besides, the most important speech patterns (words and phrases) that predict to which of the categories a particular employee’s response would be assigned were named. The solution can be used by companies for benchmarking factors of engagement in different domain areas. Research opportunities for the categories as regards their possible internal relationship are discussed. Improvement of the tool is also considered, as well as adding topic modeling and sentiment analysis methods to refine its predictive power.},
}