{"id":"https://openalex.org/W4414771487","doi":"https://doi.org/10.1109/iccv51701.2025.01094","title":"GENMO: A GENeralist Model for Human MOtion","display_name":"GENMO: A GENeralist Model for Human MOtion","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4414771487","doi":"https://doi.org/10.1109/iccv51701.2025.01094"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.01094","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01094","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2505.01425","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5078550258","display_name":"Jiefeng Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiefeng Li","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068413732","display_name":"Jinkun Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jinkun Cao","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029774691","display_name":"Haotian Zhang","orcid":"https://orcid.org/0000-0003-4936-1539"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haotian Zhang","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078529189","display_name":"Davis Rempe","orcid":"https://orcid.org/0000-0003-2256-5507"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Davis Rempe","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056503617","display_name":"Jan Kautz","orcid":"https://orcid.org/0000-0002-8830-429X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jan Kautz","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101862728","display_name":"Umar Iqbal","orcid":"https://orcid.org/0000-0001-6074-1693"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Umar Iqbal","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100337347","display_name":"Ye Yuan","orcid":"https://orcid.org/0000-0003-2988-5001"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye Yuan","raw_affiliation_strings":["NVIDIA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.7588,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.88818802,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"11766","last_page":"11776"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.3472000062465668,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.3472000062465668,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.7562999725341797},{"id":"https://openalex.org/keywords/motion-estimation","display_name":"Motion estimation","score":0.6089000105857849},{"id":"https://openalex.org/keywords/motion-capture","display_name":"Motion capture","score":0.4918000102043152},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.4828999936580658},{"id":"https://openalex.org/keywords/generalist-and-specialist-species","display_name":"Generalist and specialist species","score":0.4708999991416931},{"id":"https://openalex.org/keywords/structure-from-motion","display_name":"Structure from motion","score":0.45739999413490295},{"id":"https://openalex.org/keywords/prior-probability","display_name":"Prior probability","score":0.4530999958515167},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4300000071525574},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.39410001039505005}],"concepts":[{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.7562999725341797},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7200999855995178},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.667900025844574},{"id":"https://openalex.org/C10161872","wikidata":"https://www.wikidata.org/wiki/Q557891","display_name":"Motion estimation","level":2,"score":0.6089000105857849},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.49410000443458557},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.4918000102043152},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.48660001158714294},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4828999936580658},{"id":"https://openalex.org/C45371612","wikidata":"https://www.wikidata.org/wiki/Q3058587","display_name":"Generalist and specialist species","level":3,"score":0.4708999991416931},{"id":"https://openalex.org/C146159030","wikidata":"https://www.wikidata.org/wiki/Q7625099","display_name":"Structure from motion","level":3,"score":0.45739999413490295},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.4530999958515167},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4300000071525574},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.39410001039505005},{"id":"https://openalex.org/C124774092","wikidata":"https://www.wikidata.org/wiki/Q6917782","display_name":"Motion field","level":3,"score":0.3824999928474426},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.36570000648498535},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3596000075340271},{"id":"https://openalex.org/C2986578859","wikidata":"https://www.wikidata.org/wiki/Q657632","display_name":"Human motion","level":3,"score":0.3343000113964081},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.30379998683929443},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2953000068664551},{"id":"https://openalex.org/C2777036941","wikidata":"https://www.wikidata.org/wiki/Q6917771","display_name":"Motion analysis","level":2,"score":0.29179999232292175},{"id":"https://openalex.org/C2780440489","wikidata":"https://www.wikidata.org/wiki/Q5227278","display_name":"Data-driven","level":2,"score":0.2824000120162964},{"id":"https://openalex.org/C96250715","wikidata":"https://www.wikidata.org/wiki/Q965330","display_name":"Estimation","level":2,"score":0.27639999985694885},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.27480000257492065},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.2621000111103058},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.01094","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01094","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2505.01425","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2505.01425","pdf_url":"https://arxiv.org/pdf/2505.01425","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2505.01425","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2505.01425","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2505.01425","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2505.01425","pdf_url":"https://arxiv.org/pdf/2505.01425","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Human":[0,74],"motion":[1,5,32,39,78,92,96,101,119,124,205],"modeling":[2],"traditionally":[3],"separates":[4],"generation":[6,16,81,189],"and":[7,52,61,80,113,140,154],"estimation":[8,33,79,93,120],"into":[9],"distinct":[10],"tasks":[11,60,206],"with":[12,137],"specialized":[13],"models.":[14,65],"Motion":[15,75],"models":[17,34],"focus":[18],"on":[19],"creating":[20],"diverse,":[21],"realistic":[22],"motions":[23,153,178],"from":[24,41],"inputs":[25],"like":[26,43,182],"text,":[27],"audio,":[28,159],"or":[29],"keyframes,":[30],"while":[31,121,184],"aim":[35],"to":[36,90,143],"reconstruct":[37],"accurate":[38,117],"trajectories":[40],"observations":[42],"videos.":[44],"Despite":[45],"sharing":[46],"underlying":[47],"representations":[48],"of":[49],"temporal":[50],"dynamics":[51],"kinematics,":[53],"this":[54],"separation":[55],"limits":[56],"knowledge":[57],"transfer":[58],"between":[59,111],"requires":[62],"maintaining":[63],"separate":[64],"We":[66,126],"present":[67],"GENMO,":[68],"a":[69,83,197,208],"unified":[70,169],"Generalist":[71],"Model":[72],"for":[73],"that":[76,133,200],"bridges":[77],"in":[82],"single":[84,209],"framework.":[85],"Our":[86],"key":[87],"insight":[88],"is":[89],"reformulate":[91],"as":[94,196],"constrained":[95],"generation,":[97],"where":[98],"the":[99,109],"output":[100],"must":[102],"precisely":[103],"satisfy":[104],"observed":[105],"conditioning":[106],"signals.":[107],"Leveraging":[108],"synergy":[110],"regression":[112],"diffusion,":[114],"GENMO":[115],"achieves":[116],"global":[118],"enabling":[122],"diverse":[123,185],"generation.":[125],"also":[127],"introduce":[128],"an":[129],"estimation-guided":[130],"training":[131],"objective":[132],"exploits":[134],"in-the-wild":[135],"videos":[136],"2D":[138],"annotations":[139],"text":[141],"descriptions":[142],"enhance":[144],"generative":[145,174],"diversity.":[146],"Furthermore,":[147],"our":[148],"novel":[149],"architecture":[150],"handles":[151,202],"variable-length":[152],"mixed":[155],"multimodal":[156],"conditions":[157,181],"(text,":[158],"video)":[160],"at":[161],"different":[162],"time":[163],"intervals,":[164],"offering":[165],"flexible":[166],"control.":[167],"This":[168],"approach":[170],"creates":[171],"synergistic":[172],"benefits:":[173],"priors":[175],"improve":[176],"estimated":[177],"under":[179],"challenging":[180],"occlusions,":[183],"video":[186],"data":[187],"enhances":[188],"capabilities.":[190],"Extensive":[191],"experiments":[192],"demonstrate":[193],"GENMO's":[194],"effectiveness":[195],"generalist":[198],"framework":[199],"successfully":[201],"multiple":[203],"human":[204],"within":[207],"model.":[210]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
