{"id":"https://openalex.org/W7133124280","doi":"https://doi.org/10.3390/info17030233","title":"On the Task of Job Posting Deduplication Using Embedding-Based Filtering and LLM Validation","display_name":"On the Task of Job Posting Deduplication Using Embedding-Based Filtering and LLM Validation","publication_year":2026,"publication_date":"2026-03-01","ids":{"openalex":"https://openalex.org/W7133124280","doi":"https://doi.org/10.3390/info17030233"},"language":"en","primary_location":{"id":"doi:10.3390/info17030233","is_oa":true,"landing_page_url":"https://doi.org/10.3390/info17030233","pdf_url":"https://www.mdpi.com/2078-2489/17/3/233/pdf?version=1772354304","source":{"id":"https://openalex.org/S4210219776","display_name":"Information","issn_l":"2078-2489","issn":["2078-2489"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Information","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.mdpi.com/2078-2489/17/3/233/pdf?version=1772354304","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5118216683","display_name":"Giannis Thivaios","orcid":null},"institutions":[{"id":"https://openalex.org/I158716096","display_name":"University of Peloponnese","ror":"https://ror.org/04d4d3c02","country_code":"GR","type":"education","lineage":["https://openalex.org/I158716096"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Giannis Thivaios","raw_affiliation_strings":["Data and Media Laboratory, Department of Electrical and Computer Engineering, University of Peloponnese, 26334 Patras, Greece"],"raw_orcid":"https://orcid.org/0009-0005-2969-8881","affiliations":[{"raw_affiliation_string":"Data and Media Laboratory, Department of Electrical and Computer Engineering, University of Peloponnese, 26334 Patras, Greece","institution_ids":["https://openalex.org/I158716096"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109328120","display_name":"P Zervas","orcid":"https://orcid.org/0000-0003-2710-1066"},"institutions":[{"id":"https://openalex.org/I158716096","display_name":"University of Peloponnese","ror":"https://ror.org/04d4d3c02","country_code":"GR","type":"education","lineage":["https://openalex.org/I158716096"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Panagiotis Zervas","raw_affiliation_strings":["Data and Media Laboratory, Department of Electrical and Computer Engineering, University of Peloponnese, 26334 Patras, Greece"],"raw_orcid":"https://orcid.org/0000-0003-2710-1066","affiliations":[{"raw_affiliation_string":"Data and Media Laboratory, Department of Electrical and Computer Engineering, University of Peloponnese, 26334 Patras, Greece","institution_ids":["https://openalex.org/I158716096"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070478282","display_name":"Konstantinos C. Giotopoulos","orcid":"https://orcid.org/0000-0001-5989-6313"},"institutions":[{"id":"https://openalex.org/I174878644","display_name":"University of Patras","ror":"https://ror.org/017wvtq80","country_code":"GR","type":"education","lineage":["https://openalex.org/I174878644"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Konstantinos Giotopoulos","raw_affiliation_strings":["Department of Management Science and Technology, University of Patras, 26334 Patras, Greece"],"raw_orcid":"https://orcid.org/0000-0001-5989-6313","affiliations":[{"raw_affiliation_string":"Department of Management Science and Technology, University of Patras, 26334 Patras, Greece","institution_ids":["https://openalex.org/I174878644"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5039150569","display_name":"Giannis Tzimas","orcid":"https://orcid.org/0000-0002-4073-7256"},"institutions":[{"id":"https://openalex.org/I158716096","display_name":"University of Peloponnese","ror":"https://ror.org/04d4d3c02","country_code":"GR","type":"education","lineage":["https://openalex.org/I158716096"]}],"countries":["GR"],"is_corresponding":true,"raw_author_name":"Giannis Tzimas","raw_affiliation_strings":["Data and Media Laboratory, Department of Electrical and Computer Engineering, University of Peloponnese, 26334 Patras, Greece"],"raw_orcid":"https://orcid.org/0000-0002-4073-7256","affiliations":[{"raw_affiliation_string":"Data and Media Laboratory, Department of Electrical and Computer Engineering, University of Peloponnese, 26334 Patras, Greece","institution_ids":["https://openalex.org/I158716096"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5039150569"],"corresponding_institution_ids":["https://openalex.org/I158716096"],"apc_list":{"value":1400,"currency":"CHF","value_usd":1515},"apc_paid":{"value":1400,"currency":"CHF","value_usd":1515},"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.31533827,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"17","issue":"3","first_page":"233","last_page":"233"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.334199994802475,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.334199994802475,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.07760000228881836,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13274","display_name":"Expert finding and Q&A systems","score":0.05640000104904175,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/data-deduplication","display_name":"Data deduplication","score":0.7275999784469604},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.6486999988555908},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6011000275611877},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.5529000163078308},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5493999719619751},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5360000133514404},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4903999865055084},{"id":"https://openalex.org/keywords/equivalence","display_name":"Equivalence (formal languages)","score":0.3853999972343445}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8242999911308289},{"id":"https://openalex.org/C32587265","wikidata":"https://www.wikidata.org/wiki/Q1182260","display_name":"Data deduplication","level":2,"score":0.7275999784469604},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.6486999988555908},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6011000275611877},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.5529000163078308},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5493999719619751},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5360000133514404},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5109999775886536},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4903999865055084},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4325000047683716},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.41260001063346863},{"id":"https://openalex.org/C2780069185","wikidata":"https://www.wikidata.org/wiki/Q7977945","display_name":"Equivalence (formal languages)","level":2,"score":0.3853999972343445},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3537999987602234},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.3391000032424927},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.33059999346733093},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.31630000472068787},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3041999936103821},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.303600013256073},{"id":"https://openalex.org/C2777462759","wikidata":"https://www.wikidata.org/wiki/Q18395344","display_name":"Word embedding","level":3,"score":0.28700000047683716},{"id":"https://openalex.org/C51823790","wikidata":"https://www.wikidata.org/wiki/Q504353","display_name":"Greedy algorithm","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C10551718","wikidata":"https://www.wikidata.org/wiki/Q5227332","display_name":"Data pre-processing","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2639000117778778}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.3390/info17030233","is_oa":true,"landing_page_url":"https://doi.org/10.3390/info17030233","pdf_url":"https://www.mdpi.com/2078-2489/17/3/233/pdf?version=1772354304","source":{"id":"https://openalex.org/S4210219776","display_name":"Information","issn_l":"2078-2489","issn":["2078-2489"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Information","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:7081868c8ff64ccea124605895dfe07f","is_oa":true,"landing_page_url":"https://doaj.org/article/7081868c8ff64ccea124605895dfe07f","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Information, Vol 17, Iss 3, p 233 (2026)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.3390/info17030233","is_oa":true,"landing_page_url":"https://doi.org/10.3390/info17030233","pdf_url":"https://www.mdpi.com/2078-2489/17/3/233/pdf?version=1772354304","source":{"id":"https://openalex.org/S4210219776","display_name":"Information","issn_l":"2078-2489","issn":["2078-2489"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Information","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8","score":0.5026791095733643}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7133124280.pdf","grobid_xml":"https://content.openalex.org/works/W7133124280.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"This":[0],"paper":[1],"addresses":[2],"the":[3,50,111,116,121,138],"challenge":[4],"of":[5,39,53,100],"deduplicating":[6],"job":[7,85,102],"postings":[8],"in":[9,84],"large,":[10],"heterogeneous":[11],"datasets":[12],"by":[13],"introducing":[14],"an":[15,97],"efficient,":[16],"multi-stage":[17],"methodology":[18],"that":[19,137],"combines":[20],"embedding-based":[21],"filtering":[22],"with":[23,33,129],"Large":[24],"Language":[25],"Model":[26],"(LLM)":[27],"validation.":[28,109],"The":[29,91],"proposed":[30,92,139],"system":[31,93],"begins":[32],"data":[34],"preprocessing":[35],"and":[36,89],"semantic":[37,79,144],"vectorization":[38],"key":[40],"textual":[41],"fields":[42],"using":[43,75],"a":[44,57],"text":[45],"embedding":[46],"model.":[47],"To":[48],"reduce":[49],"computational":[51,132],"cost":[52],"exhaustive":[54],"pairwise":[55],"comparisons,":[56],"clustering-based":[58],"grouping":[59],"mechanism":[60],"is":[61,94],"employed":[62],"to":[63,66],"restrict":[64],"comparisons":[65],"semantically":[67],"coherent":[68],"clusters.":[69],"Candidate":[70],"duplicates":[71],"are":[72],"then":[73],"validated":[74],"LLMs,":[76],"which":[77],"assess":[78],"equivalence":[80],"across":[81],"highlighted":[82],"differences":[83],"titles,":[86],"descriptions,":[87],"companies,":[88],"locations.":[90],"evaluated":[95,112],"on":[96],"augmented":[98],"dataset":[99],"50,000":[101],"postings,":[103],"producing":[104],"6669":[105],"candidate":[106],"pairs":[107],"for":[108,149],"Among":[110],"models,":[113],"GPT-4o":[114],"achieved":[115],"highest":[117],"F1-score":[118],"(95.10%),":[119],"while":[120,146],"lightweight":[122],"Phi-4":[123],"model":[124],"demonstrated":[125],"competitive":[126],"performance":[127],"(92.58%)":[128],"significantly":[130],"lower":[131],"cost.":[133],"These":[134],"findings":[135],"demonstrate":[136],"hybrid":[140],"framework":[141],"achieves":[142],"high":[143],"precision":[145],"remaining":[147],"scalable":[148],"continuous":[150],"large-scale":[151],"deployment.":[152]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2026-03-02T00:00:00"}
