Regularized Q-learning through Robust Averaging

dc.contributor.author: Schmitt-Förster, Peter
dc.contributor.author: Sutter, Tobias
dc.date.accessioned: 2024-06-12T08:16:04Z
dc.date.available: 2024-06-12T08:16:04Z
dc.date.issued: 2024
dc.description.abstract: We propose a new Q-learning variant, called 2RA Q-learning, that addresses some weaknesses of existing Q-learning methods in a principled manner. One such weakness is an underlying estimation bias which cannot be controlled and often results in poor performance. We propose a distributionally robust estimator for the maximum expected value term, which allows us to precisely control the level of estimation bias introduced. The distributionally robust estimator admits a closed-form solution such that the proposed algorithm has a computational cost per iteration comparable to Watkins' Q-learning. For the tabular case, we show that 2RA Q-learning converges to the optimal policy and analyze its asymptotic mean-squared error. Lastly, we conduct numerical experiments for various settings, which corroborate our theoretical findings and indicate that 2RA Q-learning often performs better than existing methods.
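For context on the per-iteration cost comparison made in the abstract, the following is a minimal Python sketch of the standard tabular Watkins' Q-learning update that the abstract uses as a baseline; it is not the authors' 2RA estimator, and all names (q_learning_step, Q, alpha, gamma) are illustrative assumptions.

import numpy as np

def q_learning_step(Q, s, a, r, s_next, alpha=0.1, gamma=0.99):
    # One Watkins' Q-learning update on a tabular Q array of shape (num_states, num_actions).
    td_target = r + gamma * np.max(Q[s_next])   # bootstrapped target using the max over actions
    Q[s, a] += alpha * (td_target - Q[s, a])    # move the current estimate toward the target
    return Q

# Hypothetical usage: 5 states, 2 actions, one observed transition (s=0, a=1, r=1.0, s'=3).
Q = np.zeros((5, 2))
Q = q_learning_step(Q, s=0, a=1, r=1.0, s_next=3)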
dc.description.version: published
dc.identifier.arxiv: 2405.02201
dc.identifier.doi: 10.48550/arXiv.2405.02201
dc.identifier.uri: https://kops.uni-konstanz.de/handle/123456789/70107
dc.language.iso: deu
dc.subject.ddc: 004
dc.title: Regularized Q-learning through Robust Averaging
dc.type: PREPRINT
dspace.entity.type: Publication
kops.citation.bibtex:
@unpublished{SchmittForster2024Regul-70107,
  year={2024},
  doi={10.48550/arXiv.2405.02201},
  title={Regularized Q-learning through Robust Averaging},
  author={Schmitt-Förster, Peter and Sutter, Tobias}
}
kops.citation.iso690: SCHMITT-FÖRSTER, Peter, Tobias SUTTER, 2024. Regularized Q-learning through Robust Averaging
kops.citation.rdf:
<rdf:RDF
    xmlns:dcterms="http://purl.org/dc/terms/"
    xmlns:dc="http://purl.org/dc/elements/1.1/"
    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns:bibo="http://purl.org/ontology/bibo/"
    xmlns:dspace="http://digital-repositories.org/ontologies/dspace/0.1.0#"
    xmlns:foaf="http://xmlns.com/foaf/0.1/"
    xmlns:void="http://rdfs.org/ns/void#"
    xmlns:xsd="http://www.w3.org/2001/XMLSchema#" > 
  <rdf:Description rdf:about="https://kops.uni-konstanz.de/server/rdf/resource/123456789/70107">
    <dc:creator>Sutter, Tobias</dc:creator>
    <dcterms:title>Regularized Q-learning through Robust Averaging</dcterms:title>
    <dc:contributor>Sutter, Tobias</dc:contributor>
    <dcterms:issued>2024</dcterms:issued>
    <dc:contributor>Schmitt-Förster, Peter</dc:contributor>
    <dspace:isPartOfCollection rdf:resource="https://kops.uni-konstanz.de/server/rdf/resource/123456789/43615"/>
    <dcterms:isPartOf rdf:resource="https://kops.uni-konstanz.de/server/rdf/resource/123456789/43615"/>
    <dc:date rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2024-06-12T08:16:04Z</dc:date>
    <dc:language>deu</dc:language>
    <dc:creator>Schmitt-Förster, Peter</dc:creator>
    <dcterms:abstract>We propose a new Q-learning variant, called 2RA Q-learning, that addresses some weaknesses of existing Q-learning methods in a principled manner. One such weakness is an underlying estimation bias which cannot be controlled and often results in poor performance. We propose a distributionally robust estimator for the maximum expected value term, which allows us to precisely control the level of estimation bias introduced. The distributionally robust estimator admits a closed-form solution such that the proposed algorithm has a computational cost per iteration comparable to Watkins' Q-learning. For the tabular case, we show that 2RA Q-learning converges to the optimal policy and analyze its asymptotic mean-squared error. Lastly, we conduct numerical experiments for various settings, which corroborate our theoretical findings and indicate that 2RA Q-learning often performs better than existing methods.</dcterms:abstract>
    <dcterms:available rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2024-06-12T08:16:04Z</dcterms:available>
    <dspace:isPartOfCollection rdf:resource="https://kops.uni-konstanz.de/server/rdf/resource/123456789/36"/>
    <bibo:uri rdf:resource="https://kops.uni-konstanz.de/handle/123456789/70107"/>
    <dcterms:isPartOf rdf:resource="https://kops.uni-konstanz.de/server/rdf/resource/123456789/36"/>
  </rdf:Description>
</rdf:RDF>
kops.description.funding: {"first":"dfg","second":"390829875"}
kops.flag.knbibliography: true
relation.isAuthorOfPublication: 7a967b4e-9c2c-4559-b159-73e3c5b71c13
relation.isAuthorOfPublication: 5fc73a13-c03d-49f8-9668-ca67a9adf1a8
relation.isAuthorOfPublication.latestForDiscovery: 7a967b4e-9c2c-4559-b159-73e3c5b71c13
