{"id":"https://openalex.org/W4415594369","doi":"https://doi.org/10.1109/jstsp.2025.3626259","title":"SGNet: Sequence Grouping Network via Vision-Language Model for Text-Guided Video Summarization","display_name":"SGNet: Sequence Grouping Network via Vision-Language Model for Text-Guided Video Summarization","publication_year":2025,"publication_date":"2025-10-01","ids":{"openalex":"https://openalex.org/W4415594369","doi":"https://doi.org/10.1109/jstsp.2025.3626259"},"language":null,"primary_location":{"id":"doi:10.1109/jstsp.2025.3626259","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jstsp.2025.3626259","pdf_url":null,"source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5042407897","display_name":"Jiacheng Yao","orcid":"https://orcid.org/0009-0009-5271-0454"},"institutions":[{"id":"https://openalex.org/I37796252","display_name":"Beijing University of Technology","ror":"https://ror.org/037b1pp87","country_code":"CN","type":"education","lineage":["https://openalex.org/I37796252"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiacheng Yao","raw_affiliation_strings":["School of Information Science and Technology, Beijing University of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, Beijing University of Technology, Beijing, China","institution_ids":["https://openalex.org/I37796252"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014482509","display_name":"Jing Zhang","orcid":"https://orcid.org/0000-0003-2541-4923"},"institutions":[{"id":"https://openalex.org/I37796252","display_name":"Beijing University of Technology","ror":"https://ror.org/037b1pp87","country_code":"CN","type":"education","lineage":["https://openalex.org/I37796252"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jing Zhang","raw_affiliation_strings":["School of Information Science and Technology, Beijing University of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, Beijing University of Technology, Beijing, China","institution_ids":["https://openalex.org/I37796252"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100448036","display_name":"Zhuo Li","orcid":"https://orcid.org/0000-0002-9937-2669"},"institutions":[{"id":"https://openalex.org/I37796252","display_name":"Beijing University of Technology","ror":"https://ror.org/037b1pp87","country_code":"CN","type":"education","lineage":["https://openalex.org/I37796252"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Li Zhuo","raw_affiliation_strings":["School of Information Science and Technology, Beijing University of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Information Science and Technology, Beijing University of Technology, Beijing, China","institution_ids":["https://openalex.org/I37796252"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5042407897"],"corresponding_institution_ids":["https://openalex.org/I37796252"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32965896,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"19","issue":"7","first_page":"1236","last_page":"1250"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9825000166893005,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9825000166893005,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9764000177383423,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9466999769210815,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.902999997138977},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.6481000185012817},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5989999771118164},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5674999952316284},{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.5040000081062317},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4514000117778778},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.44589999318122864},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4235000014305115},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.40959998965263367},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.3898000121116638}],"concepts":[{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.902999997138977},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8824999928474426},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.6481000185012817},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6097000241279602},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5989999771118164},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5674999952316284},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.5040000081062317},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4514000117778778},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.44589999318122864},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.42739999294281006},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4235000014305115},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.40959998965263367},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.3898000121116638},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.36959999799728394},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.3621000051498413},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.34459999203681946},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.3424000144004822},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.33079999685287476},{"id":"https://openalex.org/C35639132","wikidata":"https://www.wikidata.org/wiki/Q7452468","display_name":"Sequence labeling","level":3,"score":0.32989999651908875},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3255999982357025},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.31119999289512634},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.30880001187324524},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3068999946117401},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.2777999937534332},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.27410000562667847},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.2669999897480011},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.26649999618530273},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.26589998602867126},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.265500009059906},{"id":"https://openalex.org/C172367668","wikidata":"https://www.wikidata.org/wiki/Q6504956","display_name":"Data visualization","level":3,"score":0.26339998841285706},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.2605000138282776},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/jstsp.2025.3626259","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jstsp.2025.3626259","pdf_url":null,"source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":64,"referenced_works":["https://openalex.org/W1924343884","https://openalex.org/W1965555277","https://openalex.org/W2036708442","https://openalex.org/W2106229755","https://openalex.org/W2120703069","https://openalex.org/W2302255633","https://openalex.org/W2529272619","https://openalex.org/W2549139847","https://openalex.org/W2737677090","https://openalex.org/W2781922022","https://openalex.org/W2928165649","https://openalex.org/W2963919999","https://openalex.org/W2964158702","https://openalex.org/W2967038491","https://openalex.org/W2979656080","https://openalex.org/W2982672255","https://openalex.org/W2989322838","https://openalex.org/W3015830103","https://openalex.org/W3017004831","https://openalex.org/W3025569967","https://openalex.org/W3025796084","https://openalex.org/W3034368386","https://openalex.org/W3080935714","https://openalex.org/W3083405884","https://openalex.org/W3094378471","https://openalex.org/W3097646105","https://openalex.org/W3107252718","https://openalex.org/W3126721948","https://openalex.org/W3133226919","https://openalex.org/W3162474807","https://openalex.org/W3177196641","https://openalex.org/W3190537575","https://openalex.org/W3197682865","https://openalex.org/W3201625964","https://openalex.org/W3210279979","https://openalex.org/W4214612132","https://openalex.org/W4285093674","https://openalex.org/W4310553896","https://openalex.org/W4311424438","https://openalex.org/W4312560592","https://openalex.org/W4312632488","https://openalex.org/W4313325468","https://openalex.org/W4319300174","https://openalex.org/W4361756415","https://openalex.org/W4378675345","https://openalex.org/W4379805693","https://openalex.org/W4380877627","https://openalex.org/W4385210901","https://openalex.org/W4385245566","https://openalex.org/W4386075892","https://openalex.org/W4386257881","https://openalex.org/W4387385613","https://openalex.org/W4387969127","https://openalex.org/W4388691863","https://openalex.org/W4388739184","https://openalex.org/W4389161308","https://openalex.org/W4390905706","https://openalex.org/W4392215335","https://openalex.org/W4392826163","https://openalex.org/W4392904279","https://openalex.org/W4393147243","https://openalex.org/W4394842465","https://openalex.org/W4399710004","https://openalex.org/W4402753830"],"related_works":[],"abstract_inverted_index":{"Video":[0],"summarization":[1],"can":[2],"condense":[3],"the":[4,19,41,60,146,157,178,188,193,202,206,216,222,227,230,236],"key":[5],"information":[6,200,220],"in":[7,54,187],"a":[8,11,26,111,129,163],"video":[9,103,122,184],"into":[10,201,221],"concise":[12],"format":[13],"to":[14,29,119,138,167,191,197,213,225],"help":[15],"viewers":[16],"quickly":[17],"grasp":[18],"core":[20],"content.":[21],"Existing":[22],"approaches":[23],"typically":[24],"use":[25],"single":[27],"model":[28,168],"generate":[30,120,172,226],"uniform":[31],"summaries,":[32],"ignoring":[33],"diverse":[34],"subjective":[35],"human":[36,80,107],"cognition":[37],"and":[38,70,94,99,133,160,171,204,218,251,259],"further":[39],"widening":[40],"semantic":[42,62],"gap.":[43],"Recently,":[44],"multimodal":[45],"large":[46,66],"language":[47,67],"models":[48,68,89],"(MLLMs)":[49],"have":[50],"shown":[51],"great":[52],"promise":[53],"bridging":[55],"this":[56],"gap":[57],"by":[58,71,106],"leveraging":[59],"advanced":[61],"analysis":[63],"capabilities":[64],"of":[65,86,97,151,229,255,266],"(LLMs)":[69],"generating":[72],"high-level":[73,126],"concepts":[74],"that":[75,235],"are":[76,154,181],"closely":[77],"aligned":[78,124],"with":[79,125,131,142,262],"cognition.":[81],"As":[82],"an":[83,263],"important":[84],"subset":[85],"MLLMs,":[87],"vision-language":[88,207,223],"(VLMs)":[90],"focus":[91],"on":[92,242],"alignment":[93],"semantics":[95],"extraction":[96],"visual":[98,217],"textual":[100,199,219],"modalities":[101],"for":[102],"summarization.":[104],"Inspired":[105],"cognition,":[108],"we":[109],"propose":[110],"sequence":[112,164],"grouping":[113],"network":[114],"(SGNet)":[115],"via":[116],"VLM":[117,189],"pipeline":[118,190],"text-guided":[121],"summaries":[123],"concepts.":[127],"Firstly,":[128],"backbone":[130],"low-rank":[132],"sparsity":[134],"constraints":[135],"is":[136,211],"utilized":[137],"extract":[139],"frame-level":[140,152],"features":[141,153,180],"high":[143],"value":[144],"from":[145,183],"spatial":[147],"dimension.":[148],"Then,":[149],"sequences":[150],"grouped":[155],"along":[156],"temporal":[158,175],"dimension":[159],"processed":[161],"using":[162],"Transformer-in-Transformer":[165],"(S-TNT)":[166],"inter-frame":[169],"correlations":[170],"compact":[173],"spatio":[174],"representations.":[176],"Finally,":[177],"text":[179,185],"obtained":[182],"description":[186],"guide":[192],"cross-modal":[194],"attention":[195],"mechanism":[196],"embed":[198],"S-TNT,":[203],"then":[205],"knowledge":[208],"transfer":[209],"(VL-KT)":[210],"used":[212],"seamlessly":[214],"integrate":[215],"feature":[224],"summary":[228],"video.":[231],"Empirical":[232],"evaluations":[233],"demonstrate":[234],"proposed":[237],"SGNet":[238],"achieves":[239],"state-of-the-art":[240],"performance":[241],"four":[243],"publicly":[244],"available":[245],"datasets":[246],"(TvSum,":[247],"SumMe,":[248],"UT":[249],"Ego,":[250],"VaTeX),":[252],"attaining":[253],"F1-Scores":[254],"67.57%,":[256],"55.80%,":[257],"62.80%,":[258],"52.46%,":[260],"respectively,":[261],"inference":[264],"speed":[265],"5.84":[267],"FPS.":[268]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-28T00:00:00"}
