{"id":"https://openalex.org/W4401864268","doi":"https://doi.org/10.1145/3637528.3671873","title":"Efficient Mixture of Experts based on Large Language Models for Low-Resource Data Preprocessing","display_name":"Efficient Mixture of Experts based on Large Language Models for Low-Resource Data Preprocessing","publication_year":2024,"publication_date":"2024-08-24","ids":{"openalex":"https://openalex.org/W4401864268","doi":"https://doi.org/10.1145/3637528.3671873"},"language":"en","primary_location":{"id":"doi:10.1145/3637528.3671873","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3637528.3671873","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002103818","display_name":"Mengyi Yan","orcid":"https://orcid.org/0009-0006-9991-0267"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Mengyi Yan","raw_affiliation_strings":["Beihang University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068930286","display_name":"Yaoshu Wang","orcid":"https://orcid.org/0000-0002-5760-5145"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yaoshu Wang","raw_affiliation_strings":["Shenzhen Institute of Computing Sciences, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen Institute of Computing Sciences, Shenzhen, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085916746","display_name":"K.K. Pang","orcid":"https://orcid.org/0009-0006-4086-1421"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kehan Pang","raw_affiliation_strings":["Beihang University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031241828","display_name":"Min Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Min Xie","raw_affiliation_strings":["Shenzhen Institute of Computing Sciences, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Shenzhen Institute of Computing Sciences, Shenzhen, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100380463","display_name":"Jianxin Li","orcid":"https://orcid.org/0000-0001-5152-0055"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianxin Li","raw_affiliation_strings":["Beihang University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5002103818"],"corresponding_institution_ids":["https://openalex.org/I82880672"],"apc_list":null,"apc_paid":null,"fwci":3.9259,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.93676313,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"3690","last_page":"3701"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9889000058174133,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9889000058174133,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9799000024795532,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14280","display_name":"Big Data Technologies and Applications","score":0.9782000184059143,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7890526056289673},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.6280888915061951},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.5530489683151245},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.49160417914390564},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.44593220949172974},{"id":"https://openalex.org/keywords/data-pre-processing","display_name":"Data pre-processing","score":0.4279499650001526},{"id":"https://openalex.org/keywords/data-modeling","display_name":"Data modeling","score":0.41433337330818176},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.35464563965797424},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.3444393575191498},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.17702743411064148}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7890526056289673},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.6280888915061951},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.5530489683151245},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.49160417914390564},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44593220949172974},{"id":"https://openalex.org/C10551718","wikidata":"https://www.wikidata.org/wiki/Q5227332","display_name":"Data pre-processing","level":2,"score":0.4279499650001526},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.41433337330818176},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.35464563965797424},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3444393575191498},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.17702743411064148},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3637528.3671873","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3637528.3671873","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":63,"referenced_works":["https://openalex.org/W1964786778","https://openalex.org/W1992479406","https://openalex.org/W2026147624","https://openalex.org/W2044469685","https://openalex.org/W2059554611","https://openalex.org/W2081186682","https://openalex.org/W2107966677","https://openalex.org/W2152321560","https://openalex.org/W2524620548","https://openalex.org/W2798323405","https://openalex.org/W2798649495","https://openalex.org/W2809290718","https://openalex.org/W2901288224","https://openalex.org/W2929941791","https://openalex.org/W2943501111","https://openalex.org/W2946504770","https://openalex.org/W2964133178","https://openalex.org/W2970641574","https://openalex.org/W3000214033","https://openalex.org/W3014295153","https://openalex.org/W3014705052","https://openalex.org/W3015738839","https://openalex.org/W3030496122","https://openalex.org/W3032215537","https://openalex.org/W3034733191","https://openalex.org/W3035623224","https://openalex.org/W3037285953","https://openalex.org/W3082197983","https://openalex.org/W3099700870","https://openalex.org/W3105977086","https://openalex.org/W3123375411","https://openalex.org/W3145728363","https://openalex.org/W3168406300","https://openalex.org/W3169654496","https://openalex.org/W3170796112","https://openalex.org/W3171188212","https://openalex.org/W3174036215","https://openalex.org/W3174588372","https://openalex.org/W3194187125","https://openalex.org/W3197468999","https://openalex.org/W3198445485","https://openalex.org/W3201693442","https://openalex.org/W4224919569","https://openalex.org/W4226079124","https://openalex.org/W4281826654","https://openalex.org/W4282028729","https://openalex.org/W4287121295","https://openalex.org/W4287124167","https://openalex.org/W4317767732","https://openalex.org/W4380433147","https://openalex.org/W4385573918","https://openalex.org/W4385885462","https://openalex.org/W4386298181","https://openalex.org/W4387321091","https://openalex.org/W4387846779","https://openalex.org/W4389520786","https://openalex.org/W4389609719","https://openalex.org/W6600292188","https://openalex.org/W6759363029","https://openalex.org/W6796854725","https://openalex.org/W6810737565","https://openalex.org/W6859498035","https://openalex.org/W6863071542"],"related_works":["https://openalex.org/W2989490741","https://openalex.org/W3092506759","https://openalex.org/W2367545121","https://openalex.org/W4248881655","https://openalex.org/W2482165163","https://openalex.org/W3010890513","https://openalex.org/W120741642","https://openalex.org/W138569904","https://openalex.org/W2390914021","https://openalex.org/W2389417819"],"abstract_inverted_index":{"Data":[0],"preprocessing":[1],"(DP)":[2],"that":[3,77,135,172],"transforms":[4],"erroneous":[5],"and":[6,31,81,100,117,145,181,199],"raw":[7],"data":[8,18,29,113,153],"to":[9,22,35,151,154,170,188],"a":[10,14,65,73,96,104,142,192,196],"clean":[11],"version":[12],"is":[13,47,139,149,186],"cornerstone":[15],"of":[16,26,56,83,98,130],"the":[17,23,79,128,146,155,175],"mining":[19],"pipeline.":[20],"Due":[21],"diverse":[24],"requirements":[25],"downstream":[27],"tasks,":[28],"scientists":[30],"domain":[32],"experts":[33,85],"have":[34],"handcraft":[36],"domain-specific":[37,84],"rules":[38],"or":[39],"train":[40],"ML":[41],"models":[42],"with":[43],"annotated":[44,89],"examples,":[45],"which":[46],"costly/time-consuming.":[48],"In":[49],"this":[50],"paper,":[51],"we":[52,94,132,159],"present":[53],"MELD":[54,71,138,173,185],"(<u>M</u>ixture":[55],"<u>E</u>xperts":[57],"on":[58,87,122,163],"<u>L</u>arge":[59],"Language":[60],"Models":[61],"for":[62,68,112],"<u>D</u>ata":[63],"Preprocessing),":[64],"universal":[66],"solver":[67],"low-resource":[69,193],"DP.":[70],"adopts":[72],"Mixture-of-Experts":[74],"(MoE)":[75],"architecture":[76],"enables":[78],"amalgamation":[80],"enhancement":[82],"trained":[86],"limited":[88],"examples.":[90],"To":[91,125],"fine-tune":[92],"MELD,":[93,131],"develop":[95],"suite":[97],"expert-tuning":[99],"MoE-tuning":[101],"techniques,":[102],"including":[103],"retrieval":[105],"augmented":[106],"generation":[107],"(RAG)":[108],"system,":[109],"meta-path":[110],"search":[111],"augmentation,":[114],"expert":[115,144],"refinement":[116],"router":[118,147],"network":[119,148],"training":[120],"based":[121],"information":[123],"bottleneck.":[124],"further":[126],"verify":[127],"effectiveness":[129,180],"theoretically":[133],"prove":[134],"MoE":[136],"in":[137,178,191],"superior":[140],"than":[141],"single":[143,198],"able":[150,187],"dispatch":[152],"right":[156],"experts.":[157],"Finally,":[158],"conducted":[160],"extensive":[161],"experiments":[162],"19":[164],"datasets":[165],"over":[166],"10":[167],"DP":[168],"tasks":[169],"show":[171],"outperforms":[174],"state-of-the-art":[176],"methods":[177],"both":[179],"efficiency.":[182],"More":[183],"importantly,":[184],"be":[189],"fine-tuned":[190],"environment,":[194],"e.g.":[195],"local,":[197],"low-priced":[200],"3090":[201],"GPU.":[202]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
