ConcatenatedModel.h
//===========================================================================
/*!
 *
 *
 * \brief       concatenation of two models, with type erasure
 *
 *
 *
 * \author      O. Krause
 * \date        2010-2011
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://shark-ml.org/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_MODEL_CONCATENATEDMODEL_H
#define SHARK_MODEL_CONCATENATEDMODEL_H

#include <shark/Models/AbstractModel.h>
#include <boost/scoped_ptr.hpp>
#include <boost/serialization/scoped_ptr.hpp>

namespace shark {

///\brief ConcatenatedModel concatenates two models such that the output of the first model is the input of the second.
///
///Sometimes a series of models is needed to produce the desired output, for example when input data must be
///normalized before it can be fed into the trained model. In this case, the ConcatenatedModel can be used to
///represent this series as one model.
///The easiest way to build one is via operator>> of AbstractModel:
///ConcatenatedModel<VectorType> model = model1 >> model2;
///The output type of model1 must match the input type of model2; the combined model then maps the input of
///model1 to the output of model2. Further models can be appended to the chain with add(), e.g. model.add(&model3, true).
///Warning: the chained models must outlive model. If they are destroyed first, the behavior is undefined.
template<class VectorType>
class ConcatenatedModel: public AbstractModel<VectorType, VectorType, VectorType> {
private:
	typedef AbstractModel<VectorType, VectorType, VectorType> base_type;
public:
	typedef typename base_type::BatchInputType BatchInputType;
	typedef typename base_type::BatchOutputType BatchOutputType;
	typedef typename base_type::ParameterVectorType ParameterVectorType;

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "ConcatenatedModel"; }


	///\brief Returns the expected shape of the input
	Shape inputShape() const{
		return m_layers.front().model->inputShape();
	}
	///\brief Returns the shape of the output
	Shape outputShape() const{
		return m_layers.back().model->outputShape();
	}


	///\brief Adds a model to the end of the sequence; if optimize is true, its parameters are exposed for optimization.
	void add(AbstractModel<VectorType, VectorType>* layer, bool optimize){
		m_layers.push_back({layer,optimize});
		enableModelOptimization(m_layers.size()-1, optimize);//recompute capabilities
	}

	///\brief Sets whether the parameters of the index-th model should be optimized.
	///
	/// If the model has non-differentiable submodels, disabling those can make
	/// the whole model differentiable.
	/// Note that the models are ordered as model0 >> model1 >> model2 >> ...
	void enableModelOptimization(std::size_t index, bool opt){
		SIZE_CHECK(index < m_layers.size());
		m_layers[index].optimize = opt;
		this->m_features.reset();
		bool inputDerivative = true;
		bool parameterDerivative = true;
		for(std::size_t k = 0; k != m_layers.size(); ++k){
			auto const& layer = m_layers[m_layers.size() - k - 1];//we iterate backwards through the layers
			//an optimized layer needs its own parameter derivative and input derivatives of all layers after it
			if( layer.optimize && (!layer.model->hasFirstParameterDerivative() || !inputDerivative)){
				parameterDerivative = false;
			}
			if( !layer.model->hasFirstInputDerivative()){
				inputDerivative = false;
			}
		}

		if (parameterDerivative){
			this->m_features |= base_type::HAS_FIRST_PARAMETER_DERIVATIVE;
		}

		if (inputDerivative){
			this->m_features |= base_type::HAS_FIRST_INPUT_DERIVATIVE;
		}

	}
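
	// Illustrative note (assumed usage, not part of the original header): for a ConcatenatedModel m whose
	// first layer is a fixed, non-differentiable preprocessing step, calling m.enableModelOptimization(0, false)
	// keeps the remaining chain differentiable and removes that layer's parameters from parameterVector().
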
	ParameterVectorType parameterVector() const {
		ParameterVectorType params(numberOfParameters());
		std::size_t pos = 0;
		for(auto layer: m_layers){
			if(!layer.optimize) continue;
			ParameterVectorType layerParams = layer.model->parameterVector();
			noalias(subrange(params,pos,pos+layerParams.size())) = layerParams;
			pos += layerParams.size();
		}
		return params;
	}

	void setParameterVector(ParameterVectorType const& newParameters) {
		std::size_t pos = 0;
		for(auto layer: m_layers){
			if(!layer.optimize) continue;
			ParameterVectorType layerParams = subrange(newParameters,pos,pos+layer.model->numberOfParameters());
			layer.model->setParameterVector(layerParams);
			pos += layerParams.size();
		}
	}

	std::size_t numberOfParameters() const{
		std::size_t numParams = 0;
		for(auto layer: m_layers){
			if(!layer.optimize) continue;
			numParams += layer.model->numberOfParameters();
		}
		return numParams;
	}

	boost::shared_ptr<State> createState()const{
		InternalState* state = new InternalState;
		for(std::size_t i = 0; i != m_layers.size(); ++i){
			state->state.push_back(m_layers[i].model->createState());
			state->intermediates.push_back(BatchOutputType());
		}
		return boost::shared_ptr<State>(state);
	}

	///\brief Returns the response of the index-th layer for the batch evaluated with this state.
	BatchOutputType const& hiddenResponses(State const& state, std::size_t index)const{
		InternalState const& s = state.toState<InternalState>();
		return s.intermediates[index];
	}

	///\brief Returns the internal state of the index-th layer.
	State const& hiddenState(State const& state, std::size_t index)const{
		InternalState const& s = state.toState<InternalState>();
		return *s.state[index];
	}

	using base_type::eval;
	void eval(BatchInputType const& patterns, BatchOutputType& outputs)const {
		BatchOutputType intermediates;
		outputs = patterns;
		for(auto layer: m_layers){
			swap(intermediates,outputs);
			layer.model->eval(intermediates,outputs);
		}
	}
	void eval(BatchInputType const& patterns, BatchOutputType& outputs, State& state)const{
		InternalState& s = state.toState<InternalState>();
		outputs = patterns;
		for(std::size_t i = 0; i != m_layers.size(); ++i){
			if(i == 0)
				m_layers[i].model->eval(patterns,s.intermediates[i], *s.state[i]);
			else
				m_layers[i].model->eval(s.intermediates[i-1],s.intermediates[i], *s.state[i]);
		}
		outputs = s.intermediates.back();
	}
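	// The state-based eval() above stores every layer's response in InternalState::intermediates;
	// the weighted*Derivative() members below reuse these cached responses when propagating
	// coefficients backwards through the chain.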

	void weightedParameterDerivative(
		BatchInputType const& patterns,
		BatchOutputType const & outputs,
		BatchOutputType const& coefficients,
		State const& state,
		RealVector& gradient
	)const{
		InternalState const& s = state.toState<InternalState>();
		BatchOutputType inputDerivativeLast;
		BatchOutputType inputDerivative = coefficients;
		gradient.resize(numberOfParameters());
		std::size_t paramEnd = gradient.size();
		for(std::size_t k = 0; k != m_layers.size(); ++k){
			std::size_t i = m_layers.size() - k -1;//we iterate backwards through the layers
			BatchInputType const* pInput = &patterns;
			if(i != 0)
				pInput = &s.intermediates[i-1];

			swap(inputDerivativeLast,inputDerivative);
			//if the current layer is not optimized, only the chain rule for the inputs needs to be continued
			if(!m_layers[i].optimize || m_layers[i].model->numberOfParameters() == 0){
				if(i != 0) //the lowest layer does not need to propagate an input derivative
					m_layers[i].model->weightedInputDerivative(*pInput,s.intermediates[i], inputDerivativeLast, *s.state[i], inputDerivative);
			}else{
				RealVector paramDerivative;
				if(i != 0){//in an intermediate layer, compute the chain rule for parameters and inputs
					m_layers[i].model->weightedDerivatives(*pInput,s.intermediates[i], inputDerivativeLast, *s.state[i], paramDerivative,inputDerivative);
				}
				else{//the lowest layer only needs the parameter derivative
					m_layers[i].model->weightedParameterDerivative(*pInput,s.intermediates[i], inputDerivativeLast, *s.state[i], paramDerivative);
				}
				noalias(subrange(gradient,paramEnd - paramDerivative.size(),paramEnd)) = paramDerivative;
				paramEnd -= paramDerivative.size();
			}
		}
	}
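
	// Note on the gradient layout: parameterVector() packs the parameters of the optimized layers in
	// front-to-back order, while the loop above visits the layers back-to-front; paramEnd therefore
	// moves from the end of the gradient towards its start so that each layer writes into its own block.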

	void weightedInputDerivative(
		BatchInputType const& patterns,
		BatchOutputType const & outputs,
		BatchOutputType const& coefficients,
		State const& state,
		BatchOutputType& derivatives
	)const{
		InternalState const& s = state.toState<InternalState>();
		BatchOutputType derivativeLast;
		derivatives = coefficients;
		for(std::size_t k = 0; k != m_layers.size(); ++k){
			std::size_t i = m_layers.size() - k -1;//we iterate backwards through the layers

			BatchInputType const* pInput = &patterns;
			if(i != 0)
				pInput = &s.intermediates[i-1];

			swap(derivativeLast,derivatives);
			m_layers[i].model->weightedInputDerivative(*pInput,s.intermediates[i], derivativeLast, *s.state[i], derivatives);
		}
	}

	virtual void weightedDerivatives(
		BatchInputType const & patterns,
		BatchOutputType const & outputs,
		BatchOutputType const & coefficients,
		State const& state,
		RealVector& gradient,
		BatchInputType& inputDerivative
	)const{
		InternalState const& s = state.toState<InternalState>();
		BatchOutputType inputDerivativeLast;
		inputDerivative = coefficients;
		gradient.resize(numberOfParameters());
		std::size_t paramEnd = gradient.size();
		for(std::size_t k = 0; k != m_layers.size(); ++k){
			std::size_t i = m_layers.size() - k -1;//we iterate backwards through the layers
			BatchInputType const* pInput = &patterns;
			if(i != 0)
				pInput = &s.intermediates[i-1];

			swap(inputDerivativeLast,inputDerivative);
			//if the current layer does not need to be optimized, we just compute the chain rule for the inputs
			if(!m_layers[i].optimize || m_layers[i].model->numberOfParameters() == 0){
				m_layers[i].model->weightedInputDerivative(*pInput,s.intermediates[i], inputDerivativeLast, *s.state[i], inputDerivative);
			}else{
				RealVector paramDerivative;
				m_layers[i].model->weightedDerivatives(*pInput,s.intermediates[i], inputDerivativeLast, *s.state[i], paramDerivative,inputDerivative);
				noalias(subrange(gradient,paramEnd - paramDerivative.size(),paramEnd)) = paramDerivative;
				paramEnd -= paramDerivative.size();
			}
		}
	}

	/// From ISerializable
	void read( InArchive & archive ){
		for(auto& layer: m_layers){//use a reference so that the restored optimize flag is kept
			archive >> *layer.model;
			archive >> layer.optimize;
		}
	}

	/// From ISerializable
	void write( OutArchive & archive ) const{
		for(auto layer: m_layers){
			archive << *layer.model;
			archive << layer.optimize;
		}
	}
private:
	struct Layer{
		AbstractModel<VectorType, VectorType>* model;
		bool optimize;
	};
	std::vector<Layer> m_layers;

	struct InternalState: State{
		std::vector<boost::shared_ptr<State> > state;
		std::vector<BatchOutputType> intermediates;
	};
};


///\brief Connects two AbstractModels so that the output of the first model is the input of the second.
template<class VectorType>
ConcatenatedModel<VectorType> operator>>(
	AbstractModel<VectorType, VectorType>& firstModel,
	AbstractModel<VectorType, VectorType>& secondModel
){
	ConcatenatedModel<VectorType> sequence;
	sequence.add(&firstModel, true);
	sequence.add(&secondModel, true);
	return sequence;
}

template<class VectorType>
ConcatenatedModel<VectorType> operator>>(
	ConcatenatedModel<VectorType> const& firstModel,
	AbstractModel<VectorType, VectorType>& secondModel
){
	ConcatenatedModel<VectorType> sequence = firstModel;
	sequence.add(&secondModel, true);
	return sequence;
}
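
// Usage sketch (illustration only, assuming a normalizer/network pair; Normalizer and LinearModel are
// other Shark models, and the code that trains them is omitted here):
//
//   Normalizer<RealVector> normalizer;                            // e.g. trained to rescale the inputs
//   LinearModel<RealVector> network;                              // the model that is actually optimized
//   ConcatenatedModel<RealVector> model = normalizer >> network;
//   model.enableModelOptimization(0, false);                      // freeze the normalizer during training
//   RealVector params = model.parameterVector();                  // contains only the network's parameters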

}
#endif