{"id":"https://openalex.org/W7138396613","doi":"https://doi.org/10.48550/arxiv.2603.13391","title":"WebVR: Benchmarking Multimodal LLMs for WebPage Recreation from Videos via Human-Aligned Visual Rubrics","display_name":"WebVR: Benchmarking Multimodal LLMs for WebPage Recreation from Videos via Human-Aligned Visual Rubrics","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7138396613","doi":"https://doi.org/10.48550/arxiv.2603.13391"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.13391","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13391","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.13391","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129673664","display_name":"Yuhong Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Dai, Yuhong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129643547","display_name":"Yanlin Lai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lai, Yanlin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129739369","display_name":"Mitt Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Mitt","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129744799","display_name":"Hangyu Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Hangyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088816340","display_name":"Dingming Li","orcid":"https://orcid.org/0000-0001-9695-7496"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Dingming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129743223","display_name":"Hongbo Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Hongbo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129736730","display_name":"Haodong Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haodong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129675669","display_name":"Yingxiu Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yingxiu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077368865","display_name":"Haoran Lyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Haoran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129734586","display_name":"Zheng Ge","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ge, Zheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129665355","display_name":"Xiangyu Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xiangyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129710784","display_name":"Daxin Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Daxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5129673664"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.3407000005245209,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.3407000005245209,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10789","display_name":"Interactive and Immersive Displays","score":0.11330000311136246,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.07370000332593918,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.8033000230789185},{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.7815999984741211},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.6916000247001648},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6484000086784363},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5630999803543091},{"id":"https://openalex.org/keywords/the-internet","display_name":"The Internet","score":0.45570001006126404},{"id":"https://openalex.org/keywords/web-design","display_name":"Web design","score":0.4496000111103058},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.40950000286102295}],"concepts":[{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.8033000230789185},{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.7815999984741211},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.6916000247001648},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6484000086784363},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6470000147819519},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.6159999966621399},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5630999803543091},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4796999990940094},{"id":"https://openalex.org/C110875604","wikidata":"https://www.wikidata.org/wiki/Q75","display_name":"The Internet","level":2,"score":0.45570001006126404},{"id":"https://openalex.org/C521306242","wikidata":"https://www.wikidata.org/wiki/Q190637","display_name":"Web design","level":3,"score":0.4496000111103058},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.40950000286102295},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.3885999917984009},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.37279999256134033},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3560999929904938},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.34119999408721924},{"id":"https://openalex.org/C188985296","wikidata":"https://www.wikidata.org/wiki/Q868954","display_name":"Page layout","level":2,"score":0.3310999870300293},{"id":"https://openalex.org/C89159866","wikidata":"https://www.wikidata.org/wiki/Q4119753","display_name":"Style sheet","level":3,"score":0.3179999887943268},{"id":"https://openalex.org/C62230096","wikidata":"https://www.wikidata.org/wiki/Q275969","display_name":"Crowdsourcing","level":2,"score":0.31439998745918274},{"id":"https://openalex.org/C67617509","wikidata":"https://www.wikidata.org/wiki/Q1503327","display_name":"Site map","level":5,"score":0.3050000071525574},{"id":"https://openalex.org/C79373723","wikidata":"https://www.wikidata.org/wiki/Q386275","display_name":"Web development","level":3,"score":0.3046000003814697},{"id":"https://openalex.org/C182321512","wikidata":"https://www.wikidata.org/wiki/Q1153289","display_name":"Web standards","level":3,"score":0.27570000290870667},{"id":"https://openalex.org/C118643609","wikidata":"https://www.wikidata.org/wiki/Q189210","display_name":"Web application","level":2,"score":0.27090001106262207},{"id":"https://openalex.org/C110269972","wikidata":"https://www.wikidata.org/wiki/Q184872","display_name":"Recreation","level":2,"score":0.26350000500679016},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.25999999046325684},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.2535000145435333}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.13391","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13391","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.13391","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13391","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Existing":[0],"web-generation":[1],"benchmarks":[2],"rely":[3],"on":[4,116,153],"text":[5],"prompts":[6],"or":[7],"static":[8],"screenshots":[9],"as":[10,19],"input.":[11],"However,":[12],"videos":[13],"naturally":[14],"convey":[15],"richer":[16],"signals":[17],"such":[18],"interaction":[20],"flow,":[21],"transition":[22],"timing,":[23],"and":[24,90,126,146],"motion":[25,127],"continuity,":[26],"which":[27],"are":[28],"essential":[29],"for":[30,47],"faithful":[31],"webpage":[32,38],"recreation.":[33],"Despite":[34],"this":[35,48,52],"potential,":[36],"video-conditioned":[37],"generation":[39],"remains":[40],"largely":[41],"unexplored,":[42],"with":[43,95,137],"no":[44],"dedicated":[45],"benchmark":[46,58],"task.":[49],"To":[50],"fill":[51],"gap,":[53],"we":[54],"introduce":[55],"WebVR,":[56],"a":[57,80,102],"that":[59,107],"evaluates":[60,108],"whether":[61],"MLLMs":[62],"can":[63],"faithfully":[64],"recreate":[65],"webpages":[66,73,111],"from":[67],"demonstration":[68],"videos.":[69],"WebVR":[70],"contains":[71],"175":[72],"across":[74,112],"diverse":[75],"categories,":[76],"all":[77],"constructed":[78],"through":[79],"controlled":[81],"synthesis":[82],"pipeline":[83],"rather":[84],"than":[85],"web":[86],"crawling,":[87],"ensuring":[88],"varied":[89],"realistic":[91],"demonstrations":[92],"without":[93],"overlap":[94],"existing":[96],"online":[97],"pages.":[98],"We":[99,140],"also":[100],"design":[101],"fine-grained,":[103],"human-aligned":[104],"visual":[105],"rubric":[106],"the":[109,130,142],"generated":[110],"multiple":[113],"dimensions.":[114],"Experiments":[115],"19":[117],"models":[118],"reveal":[119],"substantial":[120],"gaps":[121],"in":[122],"recreating":[123],"fine-grained":[124],"style":[125],"quality,":[128],"while":[129],"rubric-based":[131],"automatic":[132],"evaluation":[133,144],"achieves":[134],"96%":[135],"agreement":[136],"human":[138],"preferences.":[139],"release":[141],"dataset,":[143],"toolkit,":[145],"baseline":[147],"results":[148],"to":[149],"support":[150],"future":[151],"research":[152],"video-to-webpage":[154],"generation.":[155]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-03-18T00:00:00"}
