{"id":"https://openalex.org/W2901965591","doi":"https://doi.org/10.13016/m2fx7423n","title":"Detecting Objects and Actions with Deep Learning","display_name":"Detecting Objects and Actions with Deep Learning","publication_year":2018,"publication_date":"2018-01-01","ids":{"openalex":"https://openalex.org/W2901965591","doi":"https://doi.org/10.13016/m2fx7423n","mag":"2901965591"},"language":"en","primary_location":{"id":"pmh:oai:drum.lib.umd.edu:1903/21149","is_oa":true,"landing_page_url":"https://doi.org/10.13016/M2FX7423N","pdf_url":"http://hdl.handle.net/1903/21149","source":{"id":"https://openalex.org/S4306402644","display_name":"Digital Repository at the University of Maryland (University of Maryland College Park)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I66946132","host_organization_name":"University of Maryland, College Park","host_organization_lineage":["https://openalex.org/I66946132"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Dissertation"},"type":"dissertation","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://hdl.handle.net/1903/21149","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5008541631","display_name":"Bharat Singh","orcid":"https://orcid.org/0000-0001-6178-1718"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Singh, Bharat","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5008541631"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.8633999824523926,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11512","display_name":"Anomaly Detection Techniques and Applications","score":0.8633999824523926,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14339","display_name":"Image Processing and 3D Reconstruction","score":0.7954999804496765,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.7429999709129333,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.5083958506584167},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.48813924193382263},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.44159233570098877},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.35470372438430786},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.33555155992507935},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.288110613822937}],"concepts":[{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.5083958506584167},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.48813924193382263},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.44159233570098877},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.35470372438430786},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.33555155992507935},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.288110613822937}],"mesh":[],"locations_count":3,"locations":[{"id":"pmh:oai:drum.lib.umd.edu:1903/21149","is_oa":true,"landing_page_url":"https://doi.org/10.13016/M2FX7423N","pdf_url":"http://hdl.handle.net/1903/21149","source":{"id":"https://openalex.org/S4306402644","display_name":"Digital Repository at the University of Maryland (University of Maryland College Park)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I66946132","host_organization_name":"University of Maryland, College Park","host_organization_lineage":["https://openalex.org/I66946132"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Dissertation"},{"id":"mag:2901965591","is_oa":false,"landing_page_url":"https://drum.lib.umd.edu/handle/1903/21149","pdf_url":null,"source":null,"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":null},{"id":"doi:10.13016/m2fx7423n","is_oa":true,"landing_page_url":"https://doi.org/10.13016/m2fx7423n","pdf_url":null,"source":{"id":"https://openalex.org/S4306402644","display_name":"Digital Repository at the University of Maryland (University of Maryland College Park)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I66946132","host_organization_name":"University of Maryland, College Park","host_organization_lineage":["https://openalex.org/I66946132"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"thesis"}],"best_oa_location":{"id":"pmh:oai:drum.lib.umd.edu:1903/21149","is_oa":true,"landing_page_url":"https://doi.org/10.13016/M2FX7423N","pdf_url":"http://hdl.handle.net/1903/21149","source":{"id":"https://openalex.org/S4306402644","display_name":"Digital Repository at the University of Maryland (University of Maryland College Park)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I66946132","host_organization_name":"University of Maryland, College Park","host_organization_lineage":["https://openalex.org/I66946132"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Dissertation"},"sustainable_development_goals":[{"score":0.5600000023841858,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2901965591.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W1514670560","https://openalex.org/W1580852037","https://openalex.org/W2586393454","https://openalex.org/W2903684130","https://openalex.org/W2183687355","https://openalex.org/W2806377827","https://openalex.org/W2313832241","https://openalex.org/W2482524209","https://openalex.org/W2321499050","https://openalex.org/W2279510073","https://openalex.org/W1881520952","https://openalex.org/W2547982246","https://openalex.org/W2976288521","https://openalex.org/W43898171","https://openalex.org/W2288664515","https://openalex.org/W2498263802","https://openalex.org/W2034240581","https://openalex.org/W2590503690","https://openalex.org/W3157374498","https://openalex.org/W1568067809"],"abstract_inverted_index":{"Deep":[0],"learning":[1,265,443],"based":[2,279,444],"visual":[3,25,42,221,322],"recognition":[4,43,222,290,323],"and":[5,15,50,94,108,127,138,147,302,346,380,454,496,516,534,570],"localization":[6,450],"is":[7,16,134,311,397,503,540],"one":[8],"of":[9,12,61,67,81,105,122,150,173,176,182,190,225,270,287,341,387,406,492,561],"the":[10,17,82,117,171,183,243,268,281,297,305,352,360,364,371,381,385,393,440,486,506,520,537,562],"pillars":[11],"computer":[13],"vision":[14],"driving":[18],"force":[19],"behind":[20],"applications":[21],"like":[22,76,292,472],"self-driving":[23],"cars,":[24],"search,":[26],"video":[27,563],"surveillance,":[28],"augmented":[29],"reality,":[30],"to":[31,54,113,143,296,313,377,399,480,519,565],"name":[32],"a":[33,72,157,180,257,263,332,401,429,477,482,490,543],"few.":[34],"This":[35,438],"thesis":[36],"identifies":[37],"key":[38],"bottlenecks":[39],"in":[40,96,229,451],"state-of-the-art":[41,572],"pipelines":[44],"which":[45,70,84,168,446,488],"use":[46],"convolutional":[47,62],"neural":[48,63,433],"networks":[49,64],"proposes":[51],"effective":[52,111],"solutions":[53,112],"push":[55],"their":[56,86],"limits.":[57],"A":[58],"few":[59],"shortcomings":[60],"are,":[65],"lack":[66],"scale":[68,132],"invariance":[69],"poses":[71],"challenge":[73],"for":[74,125,164,205,213,218,299,319,338,355,414,435,494],"tasks":[75,291,324,471],"object":[77,174,293,335,389,412],"detection,":[78],"fixed":[79],"structure":[80],"network":[83,260,434,556],"restricts":[85],"usage":[87],"when":[88],"presented":[89],"with":[90,208,262,363,384,418,460,525],"new":[91,415],"class":[92],"labels,":[93],"difficulty":[95],"modeling":[97],"long":[98,452],"range":[99],"spatial/temporal":[100],"dependencies.":[101],"We":[102,368,509,550],"provide":[103],"evidence":[104],"these":[106,249],"problems":[107],"then":[109],"design":[110],"overcome":[114],"them.":[115],"In":[116],"first":[118,441],"part,":[119],"an":[120,151,211,230,356,567],"analysis":[121],"different":[123,177],"techniques":[124],"recognizing":[126],"detecting":[128,339],"objects":[129,140,191],"under":[130],"extreme":[131,193],"variation":[133],"presented.":[135],"Since":[136],"small":[137],"large":[139],"are":[141,251,348],"difficult":[142],"recognize":[144],"at":[145,192,242],"smaller":[146],"larger":[148],"scales":[149],"image":[152,184,231,274,300],"pyramid":[153],"respectively,":[154],"we":[155,200,330,358,409,427],"present":[156,331,428],"novel":[158,378],"training":[159,217,276,286,388],"scheme":[160],"called":[161],"Scale":[162],"Normalization":[163,204],"Image":[165,206],"Pyramids":[166,207],"(SNIP)":[167],"selectively":[169],"back-propagates":[170],"gradients":[172,189],"instances":[175,241],"sizes":[178],"as":[179],"function":[181],"scale.":[185,245],"As":[186],"SNIP":[187],"ignores":[188],"resolutions,":[194],"following":[195],"up":[196],"on":[197,280,315,514,574],"this":[198],"idea,":[199],"developed":[201],"SNIPER":[202,233,284],"(Scale":[203],"Efficient":[209],"Re-sampling),":[210],"algorithm":[212,479],"performing":[214,469],"efficient":[215],"multi-scale":[216],"instance":[219,288,320],"level":[220,289,321],"tasks.":[223],"Instead":[224],"processing":[226],"every":[227],"pixel":[228],"pyramid,":[232],"processes":[234],"context":[235],"regions":[236],"(512x512":[237],"pixels)":[238],"around":[239,485],"ground-truth":[240],"appropriate":[244],"For":[246],"background":[247,500],"sampling,":[248],"context-regions":[250],"generated":[252,272],"using":[253],"proposals":[254],"extracted":[255],"from":[256],"region":[258],"proposal":[259],"trained":[261],"short":[264],"schedule.":[266],"Hence,":[267],"number":[269,386],"chips":[271],"per":[273],"during":[275],"adaptively":[277],"changes":[278],"scene":[282],"complexity.":[283],"brings":[285],"detection":[294,345,353],"closer":[295],"protocol":[298],"classification":[301,347,366,419],"suggests":[303],"that":[304,309,370,395,502,552],"commonly":[306],"accepted":[307],"guideline":[308],"it":[310,396,455,458],"important":[312],"train":[314,411,510],"high":[316],"resolution":[317],"images":[318],"might":[325],"not":[326,504],"be":[327],"correct.":[328],"Next,":[329],"real-time":[333],"large-scale":[334],"detector":[336],"(R-FCN-3000)":[337],"thousands":[340],"classes":[342,379,390],"where":[343],"objectness":[344,361,372,403],"decoupled.":[349],"To":[350,528],"obtain":[351],"score":[354,362],"RoI,":[357],"multiply":[359],"fine-grained":[365],"score.":[367],"show":[369,551],"learned":[373],"by":[374,542],"R-FCN-3000":[375],"generalizes":[376],"performance":[382],"increases":[383],"-":[391],"supporting":[392],"hypothesis":[394],"possible":[398],"learn":[400],"universal":[402],"detector.":[404],"Because":[405],"generalized":[407],"objectness,":[408],"can":[410],"detectors":[413],"classes,":[416],"just":[417,459],"data,":[420,462],"without":[421,463],"even":[422],"requiring":[423,464],"bounding":[424,483,507,522],"boxes.":[425],"Finally,":[426],"multi-stream":[430,538],"bi-directional":[431,544,554],"recurrent":[432],"action":[436,449,568],"detection.":[437],"was":[439],"deep":[442],"system":[445,475],"could":[447,456],"perform":[448],"videos":[453],"do":[457],"RGB":[461],"any":[465],"skeletal":[466],"models":[467],"or":[468],"intermediate":[470],"pose-estimation.":[473],"Our":[474],"uses":[476],"tracking":[478],"locate":[481],"box":[484],"person,":[487],"provides":[489],"frame":[491],"reference":[493],"appearance":[495,517],"motion":[497,515],"while":[498],"suppressing":[499],"noise":[501],"within":[505,533],"box.":[508],"two":[511],"additional":[512],"streams":[513],"cropped":[518],"tracked":[521],"box,":[523],"along":[524],"full-frame":[526],"streams.":[527],"model":[529],"long-term":[530],"temporal":[531],"dynamics":[532],"between":[535],"actions,":[536],"CNN":[539],"followed":[541],"Long":[545],"Short-Term":[546],"Memory":[547],"(LSTM)":[548],"layer.":[549],"our":[553],"LSTM":[555],"utilizes":[557],"about":[558],"8":[559],"seconds":[560],"sequence":[564],"predict":[566],"label":[569],"outperforms":[571],"methods":[573],"multiple":[575],"benchmarks.":[576]},"counts_by_year":[],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
