- Example Configuration
- Configuration File
- Attribute substitution
- Grouping entries into metasites
- Single user (upgrade to 3.5)
- Singularity
- glExec
- Private Networks
- Security
- TCP Updates
- Adding custom files/scripts to the glideins
- Multiple Collectors
- Multiple HTCondor Tarballs
- XSLT Plugins to extend configuration
- Glidein's Startd Advertising to Site HTCondor-CE Collector
- Running pre/post reconfigure hooks
Example Configuration
Below is an example Factory configuration xml file. Click on any piece for a more detailed description.
<glidein ... >
  <log_retention>
    <condor_logs max_days="14.0" max_mbytes="100.0" min_days="3.0" />
    <job_logs max_days="7.0" max_mbytes="100.0" min_days="3.0" />
    <process_logs>
      <process_log extension="info" max_days="7.0" max_mbytes="100.0" min_days="3.0" msg_types="INFO" backup_count="5" compression="gz" />
      <process_log extension="debug" max_days="7.0" max_mbytes="100.0" min_days="3.0" msg_types="DEBUG,ERR,WARN" backup_count="5" />
    </process_logs>
    <summary_logs max_days="31.0" max_mbytes="100.0" min_days="3.0" />
  </log_retention>
  <monitor base_dir="/var/www/html/glidefactory/monitor" flot_dir="/opt/javascriptrrd-0.6.3/flot" javascriptRRD_dir="/opt/javascriptrrd-0.6.3/src/lib" jquery_dir="/opt/javascriptrrd-0.6.3/flot" />
  <monitor_footer display_txt="Legal Disclaimer" href_link="/site/disclaimer.html" />
  <security key_length="2048" pub_key="RSA" reuse_oldkey_onstartup_gracetime="900" remove_old_cred_freq="24" remove_old_cred_age="30">
    <frontends>
      <frontend name="vofrontend" identity="vofrontend@vofrontend.fnal.gov">
        <security_classes>
          <security_class name="frontend" username="frontend1" />
        </security_classes>
      </frontend>
    </frontends>
  </security>
  <stage base_dir="/var/www/html/glidefactory/stage" use_symlink="True" web_base_url="http://factory.fnal.gov:9000/glidefactory/stage"/>
  <submit base_client_log_dir="/opt/clientlogs/clients/logs" base_client_proxies_dir="/opt/clientlogs/clients/proxies" base_dir="/opt/wmsfactory/" base_log_dir="/opt/wmsfactory/logs" num_factories="1" />
  <attrs>
    <attr name="CONDOR_VERSION" const="True" glidein_publish="False" job_publish="False" parameter="True" publish="False" type="string" value="default" />
    <attr name="USE_CCB" const="True" glidein_publish="True" job_publish="False" parameter="True" publish="True" type="string" value="False" />
    <attr name="USE_MATCH_AUTH" const="False" glidein_publish="False" job_publish="False" parameter="True" publish="True" type="string" value="True" />
    <attr name="GLIDEIN_MaxMemMBs" const="True" glidein_publish="True" job_publish="False" parameter="True" publish="True" type="int" value="2500" />
    <attr name="PBS_RSL" const="True" glidein_publish="False" job_publish="False" parameter="False" publish="False" type="string" value="(queue=default)(jobtype=single)(maxWalltime=$$(GLIDEIN_Max_Walltime))(memory=$$(GLIDEIN_MaxMemMBs))" />
  </attrs>
  <entries>
    <entry name="EXAMPLE_ENTRY" enabled="True" auth_method="grid_proxy" trust_domain="OSG" gatekeeper="gatewayname.fnal.gov gatewayname.fnal.gov:port" gridtype="condor" rsl="$(PBS_RSL)" verbosity="std" work_dir="OSG">
      <config>
        <max_jobs>
          <per_entry held="1000" idle="2000" glideins="10000"/>
          <default_per_frontend held="50" idle="100" glideins="5000"/>
          <per_frontends>
            <per_frontend name="FRONTEND:SECURITY_CLASS" held="50" idle="100" glideins="5000"/>
          </per_frontends>
        </max_jobs>
        <release max_per_cycle="20" sleep="0.2"/>
        <remove max_per_cycle="5" sleep="0.2"/>
        <submit cluster_size="10" max_per_cycle="100" sleep="0.2" slots_layout="partitionable">
          <submit_attrs>
            <submit_attr name="RequestMemory" value="2000"/>
          </submit_attrs>
        </submit>
      </config>
      <allow_frontends />
      <attrs>
        <attr name="CONDOR_ARCH" const="True" glidein_publish="False" job_publish="False" parameter="True" publish="False" type="string" value="default"/>
        <attr name="CONDOR_OS" const="True" glidein_publish="False" job_publish="False" parameter="True" publish="False" type="string" value="default" />
        <attr name="GLIDEIN_Site" const="True" glidein_publish="True" job_publish="True" parameter="True" publish="True" type="string" value="FNAL_EXAMPLE_SITE"/>
        <attr name="GLIDEIN_CPUS" const="True" glidein_publish="True" job_publish="True" parameter="True" publish="True" type="string" value="1"/>
        <attr name="GLIDEIN_Max_Walltime" const="True" glidein_publish="False" job_publish="False" parameter="True" publish="True" type="int" value="171000"/>
      </attrs>
      <monitorgroups/>
      <files/>
      <infosys_refs />
    </entry>
  </entries>
  <files>
    <file absfname="/usr/conf/sethome.source" after_entry="False" const="True" executable="False" untar="False" wrapper="True" />
  </files>
  <condor_tarballs>
    <condor_tarball arch="default" base_dir="/opt/wmscollector/" os="default" tar_file="/var/www/html/glidefactory/stage/glidein_v2_4/condor_bin_default-default-default.a83ePm.tgz" version="default"/>
  </condor_tarballs>
  <monitoring_collectors>
    <monitoring_collector DN="/DC=org/DC=doegrids/OU=Services/CN=factmoncollector.fnal.gov" node="factmoncollector.fnal.gov" secondary="False" group="default" />
    <monitoring_collector DN="/DC=org/DC=doegrids/OU=Services/CN=factmoncollector.fnal.gov" node="factmoncollector.fnal.gov:9620-9819" secondary="True" group="default" />
    <monitoring_collector DN="/DC=org/DC=doegrids/OU=Services/CN=factmoncollector2.fnal.gov" node="factmoncollector2.fnal.gov" secondary="False" group="ha" />
    <monitoring_collector DN="/DC=org/DC=doegrids/OU=Services/CN=factmoncollector2.fnal.gov" node="factmoncollector2.fnal.gov:9620-9919" secondary="True" group="ha" />
  </monitoring_collectors>
</glidein>
The configuration file
The configuration file is an XML document. It contains both global arguments as well as specific configuration to each entry point. At least one entry point must be specified in the configuration file.
The tags of the XML configuration file are described below. Each is given a designation:
- Required: You must change or examine this in order for the Factory to function correctly.
- Recommended: The installer provides a good default, but you should examine this attribute to make sure it is correct for your installation.
- Optional: The installer-provided default is likely correct for your installation. Change this only if your particular configuration requires special treatment or fine-tuning.
Global arguments
Global arguments are common to all entry points but can be overridden by individual entry point configuration.
-
<glidein ... >
Required: The main tag of the Factory configuration. See below for the parameters for this tag:
-
<glidein glidein_name="your name">Required: The name of the configuration. It will be used to advertise the entry points, will be defined as HTCondor glidein attribute GLIDEIN_Name, and is also used to create the directory names. Choose a short name that describes the set of Grid resources it represents and append a version number (like "fnalcms_1"). Starting with v2.0 of GlideinWMS, you can use the Factory reconfig tool to make changes to the Factory configuration. You will only need a new configuration for the Factories during a major upgrade. For more details, refer to the Glidein Factory management section.
-
<glidein factory_name="your name">Recommended: Changing this value from the name of the machine allows you to move the Factory without disrupting the system.
-
<glidein schedd_name="schedd name[,schedd name]*">Recommended: If you want to use multiple HTCondor schedds or you don't like the default name, you definitely need to set this. If you specify more than a single schedd, the various entries will be equally spread among all the listed schedds. Possible values include (but are not limited to):
"myschedd@mymachine.mydomain"
"myschedd_g1@mymachine.mydomain,myschedd_g2@mymachine.mydomain,myschedd_g3@mymachine.mydomain"
-
<glidein factory_collector="collector_name:port">Recommended: You can have a dedicated collector serving just for message passing with the Frontend.
If this is not set, the default collector will be used, which also happens to be used to advertise the internal HTCondor-G ClassAds. That's undesirable both for security reasons, and also because frequent HTCondor-G updates tend to load it heavily. -
<glidein loop_delay="seconds" advertise_delay="nr" >Optional: Defines how active the glidein Factory should be. The glidein Factory works in polling mode. loop_delay defines how much time should pass between each polling loop, with the collector being updated every advertise_delay loops.
-
<glidein restart_attempts="nr" restart_interval="seconds" >Optional: Defines how many times (restart_attempts) an entry should be restarted within restart_interval seconds if the entry crashes.
-
<glidein recoverable_exitcodes="N,M,..." >Optional: Comma-separated list of condor HoldReasonCode/HoldReasonSubCode values that are going to be retried. For each code, a list of space-separated subcodes can be specified. Subcodes are optional. All codes must be integers. Example: recoverable_exitcodes="c1, c2 s1, c3 s1 s3 s4, ..."
-
<glidein advertise_with_tcp="True|False" >Optional: Defines if the Factory should use TCP to advertise its ClassAds.
-
<glidein advertise_with_multiple="True|False" >Optional: Defines if the Factory should use -multiple to advertise its ClassAds.
-
<glidein advertise_pilot_accounting="True|False" >Optional: Defines if the Factory should condor_qedit pilot jobs after they finish to add accounting information. A ClassAd MONITOR_INFO will be added and jobs will be left in the Complete state for 12 hours.
-
-
<glidein><log_retention><process_logs><process_log max_days="max days" min_days="min days" max_mbytes="max mbytes" msg_types="INFO" backup_count="backup count" compression="gz"/>
The admin can configure one or more logs with any combination of following log message types:
- INFO: Informational messages about the state of the system.
- DEBUG: Debug message. These are additional informational messages that describe code execution in detail.
- ERR: Error messages. These may include tracebacks.
- WARN: Warning messages. These warn of conditions that were found that don't necessarily cause abnormal execution.
- ADMIN: This is a special setting only applicable to Factory logs (and not entry logs) that contains admin messages, such as reconfig comments. This will create a new log called factoryadmin.extension.log. Currently, it cannot be combined with other message types.
Log Retention and Rotation Policy:
Log files are rotated based on the age and size of the log files as follows:
- If the log file size reaches max_mbytes it will be rotated (NOTE: the value will be truncated and 0 means no rotation).
- If the log file size is less than max_mbytes but the file is older than max_days, it will be rotated.
- min_days is not used and is there for backwards compatibility.
- Rotated files may be compressed. Supported compressions are Gzip "gz" and Zip "zip". Default is no compression (empty string).
- After rotation, recent number of backup_count files will be kept and older ones are deleted. Defaults to 5.
-
<glidein><condor_tarballs><condor_tarball os="os" arch="arch" base_dir="directory" version="condor version"/>
Required: Where to find the HTCondor binaries. You can list as many as you need, but at least one is required. This lets you configure glideins for different sites to use different version of HTCondor binaries based on architecture, OS of the worker nodes that could be found on the site.
It is recommended to have one default entry with os="default" arch="default" version="default".
To configure multiple tarballs, OS, arch and version accept a comma separated list. See multiple tarballs for more detailed instructions on supporting tarballs for multiple architectures. -
<glidein><submit base_dir="directory" base_log_dir="log directory" base_client_log_dir="client log directory" base_client_proxies_dir="directory where proxies will be stored" num_factories="1"/>
Recommended: Where to create the glidein submit directory. The default is the user home directory. Log directories can be configured independent of the base directory using options mentioned above. num_factories is a scaling factor used to scale down max_glideins, max_held, and max_idle for the entries.
-
<glidein><stage base_dir="web dir" web_base_url="URL"/>
Recommended: These two define where the Web server directories are located.
The defaults are reasonable, but you may have different needs. -
<glidein><monitor base_dir="web dir" javascriptRRD_dir="web dir" flot_dir="web dir" jquery_dir="web dir" />
Recommended: The base_dir defines where the monitoring web area is.
The other entries point to where javascriptRRD, Flot and JQuery libraries are. -
<glidein><monitor_footer display_txt="Legal Disclaimer" href_link="/site/disclaimer.html" />
OPTIONAL: If the display text and link are configured, the monitoring pages will display the text/link at the bottom of the page.
-
<glidein><security reuse_oldkey_onstartup_gracetime="900" remove_old_cred_age="30" remove_old_cred_freq="24" pub_key="RSA"/>
The Factory can remove old credential files saved on disk. You can set the min age (in days) the credential needs to be before it can be deleted and the frequency (in hours) to perform the removal. If you set the frequency to less than zero, the clean up functionality is disabled.
-
<glidein><security><frontends><frontend name="FrontendName" identity="username_usedby_factory@factory_hostname" />
Recommended: This configures the Frontend. Frontend on the Frontend hosting machine should have the same name as mentioned in the 'name'. Identity tells the Factory the username under which the Factory should map the given Frontend to. If this does not match the Frontend configuration, the Factory will drop all requested glideins.
-
<glidein><security><frontends><frontend><security_classes><security_class name="frontend" username="username_usedby_factory"/>
Required: Tells the class and user name the Factory uses for this Frontend.
- <security_class name="frontend" username="username_usedby_factory"/> -
<glidein><attrs><attr name="attr name" value="value" const="True" parameter="True" publish="True" glidein_publish="True" comment="comment" />
Attributes you want to publish that affect all the Factory entries.
To set specific attributes to an entry point, set them in /glidein/entries/entry/attrs section.
Table below describes the <attrs ... > tag.
Attribute Name
Attribute Description
name
Name of the attribute
value
Value of the attribute
const
Set to True if the attribute is a constant. If so, Glidein Frontend can not change it. If set to const, the attribute will be available in the constants file created in the staging area.
parameter
Set to True if the attribute should be passed as a parameter. Always set this to True unless you know what you are doing. NOTE: As of 3.4.6 in the Factory, publish has to be set to True and const to False, otherwise the attribute will not be published in the parameter list.
publish
If set to True, the attribute will be published in Factory's ClassAd
glidein_publish
If set to True, the attribute will be available in the condor_startd's ClassAd (and the glidein condor configuration). Used only if parameter is True.
job_publish
If set to True, the attribute will be available in the user job's environment. Used only if parameter is True.
comment
You can specify description of the attribute here.
type
Type of the attribute. Supported types are 'int', 'string' and 'expr'. Type expr is equivalent to condor constant/expression in condor_vars.lst
These are used by the VO Frontend matchmaking and job matchmaking.
Example attributes are:<attrs> <attr name="VOpilot" value="CMS" publish="True" parameter="True" const="True" glidein_publish="True" comment="A test attribute"/> <attr name="CondorVersion" value="v6.9.1" publish="True" parameter="True" const="True" glidein_publish="True"/> </attrs>
A list of all the attributes can be found on the dedicated configuration variables page.
-
<glidein><files><file absfname="script name" executable="True" period="period in seconds" prefix="cron prefix" after_entry="True" type="file type" comment="comment" />
Custom files configuration is discussed in the Adding custom files/scripts to the glideins paragraph in the advanced section below.
Custom files (how to write and use them), are discussed in the custom scripts page.
The other arguments are for advanced admins only, and are
explained in a dedicated section.
Entry point arguments
The following are arguments that are specific to each entry point. They override the global arguments if present.
-
<glidein><entries><entry name="entry name">
Required: Each entry point will have its own root tag with parameters:
-
<glidein><entries><entry name="entry name">Required: Specify an easy name to remember, that will identify this entry for display purposes and for specifying using command line tools.
-
<glidein><entries><entry name="entry name" auth_method="grid_proxy">Required: The authentication method this entry supports. It is advertised to the Frontends to show which credentials are required for submission. Valid values: grid_proxy, voms_proxy, key_pair, cert_pair, username_password, vm_type, vm_id, project_id, scitoken
Multiple authentication methods may be needed and there may be requirements on the authentication method depending on the type of entry. E.g. ec2 requires a credential pair (key_pair, cert_pair, username_password) and the VM id and type. BOSCO requires a key_pair and a grid_proxy. The authentication method then must be <entry auth_method="grid_proxy+project_id" >.
When multiple methods are needed these can be additional entry values: <entry vm_type="vmtype" or vm_id="vmid" >
OR
can be added to the authentication method so the Frontend is required to pass them: <entry auth_method="key_pair+vm_id+vm_type" >
-
<glidein><entries><entry name="entry name" trust_domain="OSG">Required: The trust domain for the entry. This is not interpreted by the Factory code, only used to show what credentials are valid. For example, there may be two ec2-type entries for two different clouds with the authentication method of "key-pair". This allows the Frontend to map a key pair to a particular cloud. Or in BOSCO you can use it to select the key to use with a specific cluster.
-
<glidein><entries><entry name="entry name" gatekeeper="gatekeeper">Required: The identifier of your Grid/BOSCO resource (like "cmsitbsrv01.fnal.gov/jobmanager-condor"). BOSCO resources are USER@FQDN, e.g. cmsuser@cmscluster.fnal.gov.
-
<glidein><entries><entry name="entry name" rsl="rsl">Please check the Grid site documentation and/or ask the Grid site administrator for the proper rsl and queue name for the site.
(example: '(condorsubmit=(universe vanilla)(requirements \"(ISMINOSAFS=?=True)\"))').
NOTE: If the auth_method contains "+project_id" for a TeraGrid entry, the string "(project=TG_PROJECT_ID)" will be added by the Factory and populated with the project id passed in the request. -
<glidein><entries><entry name="entry name" gridtype="grid type [default: condor]">Optional: The default condor is for HTCondor-CE Gatekeepers. Other values are ec2 or one of the batch types supported by HTCondor: batch pbs (for PBS and SLURM), batch lsf, batch sge, batch condor. This tag can specify additional HTCondor Grid types.
-
<glidein><entries><entry name="entry name" work_dir="WN dir">Recommended: This argument defines where the glidein should run once on the worker node.
Options are: ".", Condor (CONDOR), OSG, TMPDIR, AUTO
- ".": run in pwd
-
Condor (CONDOR): will run in $_CONDOR_SCRATCH_DIR
- OSG: will run in $OSG_WN_TMP
- TMPDIR: will run in $TMPDIR
-
AUTO: will try to determine which directory has enough space
(at least 1 GB of free space). Directories checked are
$_CONDOR_SCRATCH_DIR, $OSG_WN_TMP, $TG_NODE_SCRATCH,
$TG_CLUSTER_SCRATCH, $SCRATCH, $TMPDIR, $TMP, $PWD (in this
order)
- ".": run in pwd
-
<glidein><entries><entry name="entry name" bosco_dir="BOSCO dir [default: bosco]">Optional: This argument defines the directory where BOSCO (some HTCondor executables and Glite BLAHP) are installed on the BOSCO resource submit host. This is the node specified in the gatekeeper attribute when using "batch ..." grid type. This is ignored for other grid types. The path specified is relative to $HOME. Note: This must be the same directory used for the BOSCO installation on the remote resource (bosco_cluster_add). A different directory is useful only to avoid interferences if the same home directory is shared and mounted by multiple resources.
-
<glidein><entries><entry name="entry name" proxy_url="Proxy URL">Recommended: If you have a Web cache you can use, you set it here (like "cmsitbsquid002.fnal.gov:3128"). On OSG resources, you can set it to "OSG", and the default OSG squid will be used. If you cannot use any Web cache server, you can skip this argument (the default is not to use caching). If defined, the user jobs will be able to use it as "GLIDEIN_Proxy_URL" environment variable.
-
<glidein><entries><entry name="entry name" schedd_name="schedd name">Optional: If you have an entry that needs a dedicated schedd, you can set it here (to something like "myveryspecialschedd@mymachine.mydomain")
-
<glidein><entries><entry name="entry name" enabled="True/False">Optional: You can define an entry point even if you do not plan to use it right away. The entry point directory will be created independently of the enabled flag, but will only be used by the glidein Factory if it is set to True. (Defaults to True).
-
<glidein><entries><entry name="entry name" verbosity="std/fast/nodebug">Optional: Specify the verbosity level and termination time in case of validation errors:
- std (default) – reasonable verbosity (including the condor log files) and 20min sleep in case of error (to reduce the damage resulting from broken nodes)
- fast – same verbosity as std, but will only wait 2 mins before terminating in case of error (good for debugging)
- nodebug – very low verbosity, if you want to save on disk space
-
-
<glidein><entries><entry name="entry name">
<config>
<max_jobs num_factories="3">
<per_entry held="1000" idle="2000" glideins="10000"/>
<default_per_frontend held="50" idle="100" glideins="5000"/>
<per_frontends>
<per_frontend name="FRONTEND:SECURITY_CLASS" held="50" idle="100" glideins="5000"/>
</per_frontends>
</max_jobs>
<release max_per_cycle="20" sleep="0.2"/>
<remove max_per_cycle="5" sleep="0.2"/>
<submit cluster_size="10" max_per_cycle="100" sleep="0.2" slots_layout="partitionable">
<submit_attrs>
<submit_attr name="RequestMemory" value="2000"/>
<submit_attr name="+GlideinSkipIdleRemoval" value="True" all_grid_types="True"/>
</submit_attrs>
</submit>
</config>The config section controls the glidein policies for the entry.
-
The max_jobs section limits the number of glideins held, idle,
and total (including running).
- <per_entry ... > specifies the limits per entry.
- <default_per_frontend ... > specifies default limits for each frontend-security class (for this entry).
- <per_frontend ... > overrides the limits for a specific frontend-security class.
- Release regulates how many glideins are released per cycle.
- Remove regulates how many glideins are removed per cycle.
-
Submit limits how fast the glideins are submitted and controls
the glideins type. Each main loop (cycle) the glideins are
submitted using multiple condor_submit commands
- cluster_size limits how many glideins are submitted with one condor_submit command (if more glideins need to be submitted, multiple condor_submit invocations are used).
- sleep is the minimum wait between two condor_submit commands (actual time is longer).
- max_per_cycle limits how many glideins to submit every cycle
- slots_layout tells what kind of glideins the entry supports: "fixed" number of slots; or "partitionable" slots, which enables HTCondor's partitionable/dynamic slots. The number of cores claimed by the glidein can be controlled by the GLIDEIN_CPUS variable. GLIDEIN_ESTIMATED_CPUS can provide an estimate used for matching when the number of cores is discovered automatically.
- <submit_attr ... > specifies additional HTCondor attributes to add to the HTCondor submit file that the Factory uses to submit pilots to a site. These attributes are only added when the gridtype is "batch SOMETHING" (using BOSCO) or "condor" or "ec2" or "gce". The attributes are ignored for all other grid types unless the attribute 'all_grid_types' is set to true.
-
The following submit attribute can be added to disable the
temporary removal of idle glideins:
<submit_attr name="+GlideinSkipIdleRemoval" value="True" all_grid_types="True"/>
-
The max_jobs section limits the number of glideins held, idle,
and total (including running).
-
<glidein><entries><entry name="entry name"><attrs><attr name="attr name" value="value" const="True" parameter="True" publish="True" glidein_publish="True" comment="comment"/>
Attributes you want to publish into the HTCondor ClassAd. These are used by the VO Frontend matchmaking and job matchmaking.
Example attributes are:
<attr name="HasMySoftware" value="True" publish="True" parameter="True" const="True" glidein_publish="True" comment="My users cannot live without" /> <attr name="OS" value="Linux" publish="True" parameter="True" const="True" glidein_publish="True"/>
Other pre-defined attributes are listed below:-
<glidein><entries><entry name="entry name"><attrs><attr name="GLIDEIN_Site" value="value" const="True" parameter="True" publish="True"/>Recommended: This defines the glidein attribute GLIDEIN_Site, both for use of the Frontend and for the use of the job negotiation. Logically defining a site is useful, so that you can change entry points while the user jobs still know where they are running. If not specified, it defaults to the entry point name in the startd ClassAd.
-
<glidein><entries><entry name="entry name"><attrs>Recommended: Select a non-default HTCondor binary.
<attr name="CONDOR_VERSION" value="version" type="string" const="True" parameter="True" publish="False" />
<attr name="CONDOR_ARCH" value="arch" type="string" const="True" parameter="True" publish="False"/>
<attr name="CONDOR_OS" value="os" type="string" const="True" parameter="True" publish="False"/>
The entry will default to CONDOR_OS="default" CONDOR_ARCH="default" CONDOR_VERSION="default", if not otherwise defined. -
<glidein><entries><entry name="entry name"><attrs><attr name="X509_CERT_DIR" value="value" const="True" parameter="True" glidein_publish="False" job_publish="True" publish="True"/>Recommended for BOSCO resources: BOSCO resources normally have no OSG software installed, so it is safer to publish the location of the CA certificates because the job may not be able to find them otherwise.
-
<glidein><entries><entry name="entry name"><attrs><attr name="GLIDEIN_CPUS" value="value" const="True" parameter="True" publish="True"/>Recommended: Specifies how many CPUs (cores) to claim on the compute node
Valid values are 1..N and "auto", "slot" or "node" (corresponding to 0, 0, -1) which means to use all cores on the slot or on the node (see it on the configuration variables page for more). The entry will default to 1 if not otherwise defined.
-
-
<glidein><entries><entry name="entry name"> <allow_frontends> <allow_frontend name="vofrontend_name" security_class="security_class_name"/>
This argument allows you to create a whitelist of VO Frontends that can access this entry point. If this tag is blank or missing, it is assumed that all VO Frontends can submit glideins to this entry point. However, if any allow_frontend tags exist, the entry point will only allow those Frontends to submit glideins. The name of the Frontend must match the name given in the security class above in the configuration.
For each Frontend, you must tell it which security classes (e.g. proxies) can use the Frontend. The Factory will only submit glideins on behalf of these security classes. If you want all security classes to be allowed, you can put "All" in this field. Otherwise, it must match the security class configuration higher up in the xml.
-
<glidein><entries><entry name="entry name"> <infosys_refs><infosys_ref ref="filename" server="SERVER" type="RESS/BDII"/>
This argument is placed here by the installers based on information from BDII/RESS. This gives you information on where the server's information was retrieved from. It can also be used to retrieve downtime information from RESS/BDII.
Attribute substitution
Attribute substitution can be used to parametrize the Factory configuration.
Anytime one of the following two bits are found inside the configuration file
- $$(attr_name)
- $(attr_name)
the value of the named attribute is inserted in that place.
The $$ variant will interpret the type of the attribute,
quoting its value as appropriate, while the $ requires a
string type and will use it as-is.
They can appear in any string of the configuration file.
If an attribute is defined both in the main and the entry section, the
entry section one will prevail.
This can be used to e.g. define a generic expression in the global
section, and customize it on an entry-by-entry basis by only
re-defining the relevant bits.
The attribute expansion also works recursively, as long as there are
no loops.
$(DOLLAR) can be used to represent the $ sign.
For example, with
<attr name="GLIDEIN_MaxMemMBs" const="True" glidein_publish="True" job_publish="False" parameter="True" publish="True" type="int" value="2500" />
<attr name="GLIDEIN_Max_Walltime" const="True" glidein_publish="False" job_publish="False" parameter="True" publish="True" type="int" value="171000"/>
<attr name="PBS_RSL" const="True" glidein_publish="False" job_publish="False" parameter="False" publish="False" type="string" value="(jobtype=single)(maxWalltime=$$(GLIDEIN_Max_Walltime))(memory=$$(GLIDEIN_MaxMemMBs))" />
... rsl="(queue=default)$(PBS_RSL)" ...
will be expanded into
... rsl="(queue=default)(jobtype=single)(maxWalltime=171000)(memory=2500)" ...
Please notice that the Factory will throw an error on reconfig if a referenced attribute is not defined.
Grouping entries into metasites
Starting in v3.2.20, entries with similar configuration can be grouped in <entry_sets> to form what we call a metasite. <entry_sets> starts right after the <entries> tag is closed. A metasite is defined by starting an <entry_set>, which then contains the common configuration for the different entries, and then an <entries> tag containing the different entries for the metasite.
The following is an example of two different entry sets, each containing two separate entries.
The attributes auth_method, gridtype, and trust_domain must be the same in all entry elements (they are used for credential generation in the Frontend).
Only the metasite will be advertised as a glideresource ClassAd, so the Frontend will only see one element (entry). The Frontend will apply limits transparently as before, and the Factory as well. Right before glidein submission, the Factory will detect that there are multiple CEs where the work can be sent (multiple submission files), count the running+idle glideins for each CE, and send jobs to the one with the fewest. The submission file is added to the job condor submit file for accounting purposes (GlideinEntrySubmitFile ClassAd).
The <entry_selection> parameter determines how the factory decides between the subentries of the metasite. By default the frontend requests are equally split among subentries until they are filled. Limits for the subentries correspond to the limits for the metasites divided by the number of subentries. More algorithms may be implemented in the future.
</entries>
<entry_sets>
<entry_set alias="T2_US_UCSD" enabled="True">
<config></entry_set>
<max_jobs></config>
<default_per_frontend glideins="5000" held="50" idle="100"/></max_jobs>
<per_entry glideins="10000" held="1000" idle="4000"/>
<per_frontends>
</per_frontends>
<release max_per_cycle="20" sleep="0.2"/>
<remove max_per_cycle="5" sleep="0.2"/>
<restrictions require_voms_proxy="False"/>
<entry_selection algorithm_name="Default" >
<submit cluster_size="10" max_per_cycle="25" sleep="2" slots_layout="partitionable">
<submit_attrs></submit>
</submit_attrs>
<allow_frontends>
</allow_frontends>
<attrs>
<attr name="CONDOR_ARCH" const="False" glidein_publish="False" job_publish="False" parameter="True"</attrs>
publish="True" type="string" value="default"/>
<attr name="CONDOR_OS" const="False" glidein_publish="False" job_publish="False" parameter="True"
publish="True" type="string" value="default"/>
<attr name="GLIDEIN_Site" const="True" glidein_publish="True" job_publish="True" parameter="True" publish="True" type="string" value="UCSD"/>
<attr name="GLIDEIN_Supported_VOs" const="True" glidein_publish="True" job_publish="True" parameter="True" publish="True" type="string" value="CMS,ATLAS,Fermilab,LIGO,NWICG,HCCLONG"/>
<attr name="GLIDEIN_CPUS" const="True" glidein_publish="False" job_publish="True" parameter="True" publish="True" type="string" value="8"/>
<files>
</files>
<infosys_refs>
</infosys_refs>
<monitorgroups>
</monitorgroups>
<entries>
<entry name="CMSHTPC_T2_US_UCSD_gw2" auth_method="grid_proxy" comment="Created new multicore entry on 2016/03/29 -- Marty" enabled="True" gatekeeper="osg-gw-2.t2.ucsd.edu osg-gw-2.t2.ucsd.edu:9619" gridtype="condor" proxy_url="OSG" trust_domain="grid" verbosity="std" work_dir="Condor"></entries>
</entry>
<entry name="CMSHTPC_T2_US_UCSD_gw4" auth_method="grid_proxy" comment="Created new multicore entry on 2016/03/29 -- Marty" enabled="True" gatekeeper="osg-gw-4.t2.ucsd.edu osg-gw-4.t2.ucsd.edu:9619" gridtype="condor" proxy_url="OSG" trust_domain="grid" verbosity="std" work_dir="Condor">
</entry>
<entry_set alias="T2_CH_CERN" enabled="True"></entry_sets>
<config></entry_set>
<max_jobs></config>
<default_per_frontend glideins="5000" held="50" idle="100"/></max_jobs>
<per_entry glideins="10000" held="1000" idle="4000"/>
<per_frontends>
</per_frontends>
<release max_per_cycle="20" sleep="0.2"/>
<remove max_per_cycle="5" sleep="0.2"/>
<restrictions require_voms_proxy="False"/>
<entry_selection algorithm_name="Default" >
<submit cluster_size="10" max_per_cycle="10" sleep="2"
slots_layout="fixed">
<submit_attrs></submit>
</submit_attrs>
<allow_frontends>
</allow_frontends>
<attrs>
<attr name="GLIDEIN_CMSSite" const="True" glidein_publish="True" job_publish="True" parameter="True" publish="True" type="string" value="T2_CH_CERN"/></attrs>
<attr name="GLIDEIN_CPUS" const="True" glidein_publish="False" job_publish="True" parameter="True" publish="True" type="string" value="8"/>
<attr name="GLIDEIN_Country" const="True" glidein_publish="True" job_publish="True" parameter="True" publish="True" type="string" value="CH"/>
<attr name="GLIDEIN_MaxMemMBs" const="True" glidein_publish="False" job_publish="True" parameter="True" publish="True" type="int" value="22000"/>
<attr name="GLIDEIN_Max_Walltime" const="True" glidein_publish="False" job_publish="False" parameter="True" publish="True" type="int" value="257400"/>
<attr name="GLIDEIN_ResourceName" const="True" glidein_publish="True" job_publish="True" parameter="True" publish="True" type="string" value="CERN-PROD"/>
<attr name="GLIDEIN_Retire_Time" const="True" glidein_publish="False" job_publish="False" parameter="True" publish="True" type="int" value="108000"/>
<attr name="GLIDEIN_Site" const="True" glidein_publish="True" job_publish="True" parameter="True" publish="True" type="string" value="CERN"/>
<attr name="GLIDEIN_Supported_VOs" const="True" glidein_publish="False" job_publish="False" parameter="True" publish="True" type="string" value="CMS"/>
<files>
</files>
<infosys_refs>
</infosys_refs>
<monitorgroups>
</monitorgroups>
<entries>
<entry name="CMSHTPC_T2_CH_CERN_ce301" auth_method="grid_proxy" comment="Converted to multicore 2016-03-29-Vassil" enabled="True" gatekeeper="ce301.cern.ch:8443/cream-lsf-grid_cms" gridtype="cream" rsl="WholeNodes = False; HostNumber = 1; CPUNumber = 8" trust_domain="grid" verbosity="std" work_dir="."></entries>
</entry>
<entry name="CMSHTPC_T2_CH_CERN_ce302" auth_method="grid_proxy" comment="Converted to multicore 2016-03-31-Vassil" enabled="True" gatekeeper="ce302.cern.ch:8443/cream-lsf-grid_cms" gridtype="cream" rsl="WholeNodes = False; HostNumber = 1; CPUNumber = 8" trust_domain="grid" verbosity="std" work_dir=".">
</entry>
...
Advanced topics
While the above is enough for setting up a personal glidein pool on the local area network, you will need to do more fine tuning when deploying a larger one. In this section, the various advanced aspects of glidein pools will be presented.
Migration process for the removal of dependency of condor_root_switchboard (Upgrade to v3.5)
Starting with v3.5, GlideinWMS stopped supporting Globus GRAM GT2/GT5, and the Factory and all pilot jobs run under a single user (gfactory) to eliminate the need for the switchboard and setuid/user-switching. Thus, all glideins will run using the Factory user (no more separate users per VO). For the log and secure files, the directory structure remains the same; only the ownership will change.
The transition for the new jobs is clear. But either for upgrading an
existing Factory version from v3.4.x or earlier (even if there are no
glideins) or for the old glideins that are still running, there is a
process which will ease the migration. This
process involves the execution of a script which:
1. Will change the ownership of the log and security file
directories
2. Will run a sed command on the condor job_queue files to change
ownership of jobs inside condor
First of all, to check what the script does, you must be located in
factory/tools of your GWMS installation and execute like (add --debug
for a more verbose output):
sudo fact_chown --user=gfactory --group=gfactory --test --backup
This execution should show you how the script works without making changes. It will also create a backup of the files that it is going to modify in the current working directory. Please notice that if you create the backup now, you must make sure that both condor and the Factory are not running. Otherwise the backup might be out of date.
If you know what you are doing and, you would like actually to do the migration, please follow the following steps:
1. Stop Factory and stop condor. (This step is very important condor must not run when the script is being executed!)
2. Run the script fact_chown indicating the user and the group which the directories and their content will take the ownership. Add also the flag --backup to have backup of everything (or not if you are happy with a backup created previously). Example with backups (notice the absence of --test):
sudo fact_chown --user=gfactory --group=gfactory --backup
3. Start condor and Factory and check the changes
Remember:
- Run the script with no options if you don't want to have backups.
- If you see the permissions have changed but not the ownership, make
sure you have executed the script with condor stopped.
- If pilots go held after executing the script, make sure that you
have executed the steps as indicated before. If condor was running
during the script execution: stop condor, run the script again and
release the jobs.
After executing these steps, you should see your jobs changed user in
condor_q to the new owner. Also, the ownership should have changed to the Factory user
(gfactory) for:
- Log files directories in /var/log/gwms-factory/client/
- Security files directory in /var/lib/gwms-factory/client-proxies/
- All files and folders in the corresponding folder's tree indicated
before:
-
/var/log/gwms-factory/client/<user_username>/glidein_gfactory_instance/
- /var/lib/gwms-factory/client-proxies/<USER_DIRS>
To revert to a previous version of GlideinWMS, you need to
restore the job_queue files and change back the permissions of the log
directories. As commented previously, those operations need to be
performed with both condor and GlideinWMS stopped.
- To restore the job_queue condor files: untar all the
schedd_glideins1_job_queue.log.tar.gz files, and move the
job_queue.log file into its original location. You can print it when
you run the fact_chown script (you can also run it with --test and
--skipchown)
- To restore the directory permissions for the logfiles, you can run
the following command:
for dir in /var/log/gwms-factory/client/user_fe*; do USERARR=(${dir//_/ }); USER=${USERARR[1]}; chown -R $USER:$USER $dir; done
CCB - Condor Connection Broker
The detailed description of CCB is beyond the scope of this manual and
you should refer to the HTCondor documentation available
here. In this section you will find only the parameters needed to enable it in
the glideins.
To use HTCondor with CCB, you need to specify:
[<entries><entry>]
<attrs>
<attr name="USE_CCB" value="True" publish="False" parameter="True"/>
and you are done. Just make sure you follow the suggested scalability guidelines described in the HTCondor manual.
Integration with Singularity
-
An entry can control the use of Singularity by setting GLIDEIN_SINGULARITY_REQUIRE to NEVER (Singularity is not supported), OPTIONAL or PREFERRED (capable of Singularity but it is not enforced), REQUIRED (jobs must run with Singularity) or REQUIRED_GWMS (jobs must run with Singularity and use the GWMS wrapper scripts). This last option is the only one that really enforces Singularity, but is not compatible with VOs that currently self-manage Singularity with custom scripts, like OSG and CMS. The attribute can be set in the general or entry <attrs> section of the Factory configuration: <attr name="GLIDEIN_SINGULARITY_REQUIRE" const="True" glidein_publish="True" job_publish="True" parameter="True" publish="True" type="string" value="REQUIRED"/>.
The value of GLIDEIN_SINGULARITY_REQUIRE is used by the Frontend to provision resources and by the Glidein to negotiate the use of Singularity with the jobs. An entry where Singularity is OPTIONAL or PREFERRED will allow to run without Singularity if the job prefers so or if Singularity fails. An entry requiring Singularity (REQUIRED, REQUIRED_GWMS) will not allow to run without Singularity and the Glidein will fail if Singularity fails (e.g. the singularity binary or the image are not found). -
If Singularity is allowed or required (all values of GLIDEIN_SINGULARITY_REQUIRE except NEVER), singularity is expected on all worker nodes of the entry. The Glidein will search for a singularity binary in the PATH and then by invoking Modules (if available). SINGULARITY_BIN can no longer be used to control the use of Singularity in the Factory. Use GLIDEIN_SINGULARITY_REQUIRE instead. SINGULARITY_BIN is only considered in the search for singularity: the keyword OSG (default) starts the search from the OSG provided binary, the keyword PATH starts the search from the system path. See the SINGULARITY_BIN description for more.
-
NOTE: For compatibility with previous versions and to ease the migration to the use of GWMS scripts, GLIDEIN_SINGULARITY_REQUIRE=REQUIRED works only if Singularity is managed via GWMS. GLIDEIN_Singularity_Use=DISABLE_GWMS in the Frontend configuration (default) allows VOs to manage Singularity independently from GWMS. GLIDEIN_SINGULARITY_REQUIRE=REQUIRED_GWMS will not accept jobs where GLIDEIN_Singularity_Use=DISABLE_GWMS. It is a stronger enforcement, but will not allow VOs to manage Singularity on their own. Jobs with DISABLE_GWMS will not trigger Glideins on entries with REQUIRED_GWMS, and the Glidein will fail at setup if somehow this combination happens.
-
The Factory can specify a dictionary of available Singularity images using SINGULARITY_IMAGES_DICT. This can be overridden by the Frontend, unless the attribute is declared constant (const="True"). <attr name="SINGULARITY_IMAGES_DICT" const="False" glidein_publish="True" job_publish="True" parameter="True" publish="True" type="string" value="rhel7:/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el7:latest,rhel6:/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el6:latest,rhel8:/cvmfs/singularity.opensciencegrid.org/opensciencegrid/osgvo-el8:latest"/>.
-
The Factory can specify bind-mounts for Singularity images using GLIDEIN_SINGULARITY_BINDPATH_DEFAULT. This can be overridden by the Frontend, unless the attribute is declared constant (const="True"). If one or more of the specified paths does not exist on the node, it will be removed from the list. <attr name="GLIDEIN_SINGULARITY_BINDPATH_DEFAULT" const="False" glidein_publish="True" job_publish="True" parameter="True" publish="True" type="string" value="/cvmfs,/src_path:/dst_path" />.
-
The use of Singularity is negotiated according to the table below, using GLIDEIN_SINGULARITY_REQUIRE (from the Factory) and GLIDEIN_Singularity_Use (from the Frontend), resulting in DISABLE (do not use the GWMS Singularity mechanisms), FAIL (do not provision on this entry, do not match with these Glideins or fail the Glidein), NEVER (do not use Singularity), PREFERRED (use Singularity but allow to fall-back to no Singularity if something goes wrong, e.g. no image is found) or REQUIRED (use Singularity and fail the Glidein if it cannot run with Singularity).
The specification of a Singularity image in a job (+SingularityImage) removes the possibility to fall back to no Singularity if something goes wrong. The table below combines GLIDEIN_SINGULARITY_REQUIRE values (first row, in bold) and GLIDEIN_Singularity_Use values (first column, in bold) and the results are in the intersection. REQUIRED_GWMS is in the same column as REQUIRED and the only difference is with DISABLE_GWMS, where it causes FAIL instead of DISABLE.
| GLIDEIN_Singularity_Use \ GLIDEIN_SINGULARITY_REQUIRE | NEVER | OPTIONAL | PREFERRED | REQUIRED (REQUIRED_GWMS) |
| NEVER | NEVER | NEVER | NEVER | FAIL |
| OPTIONAL | NEVER | NEVER | PREFERRED | REQUIRED |
| PREFERRED | NEVER | PREFERRED | PREFERRED | REQUIRED |
| REQUIRED | FAIL | REQUIRED | REQUIRED | REQUIRED |
| DISABLE_GWMS | DISABLE | DISABLE | DISABLE | DISABLE (FAIL) |
GlExec
All of the glideins in a Frontend group are submitted with the same service proxy. gLExec is a tool, now unsupported, that was used before containers to achieve security (sandboxing) and isolation between different jobs in the same Glidein. It used a service with sudo privileges to switch the jobs to different Unix users depending on the identity in an x509/VOMS Proxy. To obtain the same results, look into Singularity. This section is kept for historical reasons.
Private networks and firewalls
HTCondor daemons need two way communication in order to work properly. This clashes with the network policies of most Grid sites, that have worker nodes in private networks or implement a restrictive firewall.
HTCondor provides the CCB mechanism to address this. It also used to provide a second mechanism, GCB, but it is no longer supported after HTCondor 7.8, so remove it if you still have it in your configuration.
CCB - Condor Connection Broker
The detailed description of CCB is beyond the scope of this manual and
you should refer to the HTCondor documentation available
here. In this section you will find only the parameters needed to enable it in
the glideins.
To use HTCondor with CCB, you need to specify:
[<entries><entry>]
<attrs>
<attr name="USE_CCB" value="True" publish="False" parameter="True"/>
and you are done. Just make sure you follow the suggested scalability guidelines described in the HTCondor manual.
Security handles
As mentioned in the
startup page, the glidein
pool must be properly configured to protect it from hackers and
malicious users. The same page also describes what needs to be done on
the collector machine.
The glidein itself can also be configured.
The default configuration works fine for most users, but you may need
to change it.
The values are set using the
<attr /> option, and the default values are:
- SEC_DEFAULT_ENCRYPTION=OPTIONAL
- SEC_DEFAULT_INTEGRITY=REQUIRED
- DELEGATE_JOB_GSI_CREDENTIALS=False
HTCondor also supports a more efficient authentication mechanism between the condor_schedd/condor_shadow and condor_startd/condor_starter. This method uses the match ClaimId as a shared password for authentication between these daemons. Since using a shared secret is much cheaper than using GSI authentication, this should be used every time it is feasible.
This option is enabled by default. It can be controlled with the <attr /> option:
<attr name=USE_MATCH_AUTH ... value=True.. /> ... enabled
<attr name=USE_MATCH_AUTH ... value=False.. /> ... disabled
When enabled, this HTCondor attribute must be set in the
condor_config of the submit machine.
This option is not used by the HTCondor negotiator or collector and
therefore not needed if they are installed separately.
SEC_ENABLE_MATCH_PASSWORD_AUTHENTICATION = True
Using TCP to send updates to the Collector
By default, HTCondor uses UDP packets to communicate between the glideins and the Collector. While more efficient than TCP, UDP packets are often blocked at the firewall, or lost on the WAN.
To disable TCP updates, specify, with the <attr/> option:
UPDATE_COLLECTOR_WITH_TCP=False
In GlideinWMS, we enable the glideins to update the user collector
using TCP by default.
Please be aware that this will configure
the glideins only; you still need to properly configure the Collector
machine. See
Condor documentation
for more details.
Multiple Collectors
By default, HTCondor uses only one Collector for the glidein (user) pool. However, if the load becomes too high on the collector, you can configure multiple collectors in a chain.
You will need a master and a set of slave collectors. Each slave collector will service a portion of the pool and will forward communication between the startd daemons to the master collector. Machine ClassAds from these startd's will be sent to the master collector. The negotiator and the schedds will talk to the master collector, and the startds will talk to one of the slave ones. This will reduce load on the central manager.
To set up slave collector in the glidein (user) pool, one way is to set the following env variables before starting up the condor_master:
COLH=`condor_config_val COLLECTOR_HOST` LD=`condor_config_val LOCAL_DIR` export _CONDOR_COLLECTOR_HOST=$COLH: export _CONDOR_MASTER_NAME=collector_ export _CONDOR_DAEMON_LIST="MASTER, COLLECTOR" export _CONDOR_LOCAL_DIR=$LD/$_CONDOR_MASTER_NAME export _CONDOR_LOCK=$_CONDOR_LOCAL_DIR/lock # Forward all the traffic to the main collector export _CONDOR_CONDOR_VIEW_HOST=localhost:9618
Using
Once you have the slave collectors set up, you will want to use them.
The VO Frontend will have to point the Factory to a list of collectors.
The configuration internally will add a line in the Factory configuration file that will set up the glideins to handle the multiple collectors. (You should now see a line like: "<file absfname="web_base/collector_setup.sh" executable="True"/>" after reconfiguring).
Setting the glidein start and rank condition
As with any HTCondor pool, you may need to set the startd
start
and
rank
conditions.
For a glidein, you can set this with the <attr/> options:
GLIDEIN_Start=expression
GLIDEIN_Rank=expression
For example:
[<entries><entry>]
<attrs>
<attr name="GLIDEIN_Start" value="Owner==&quot;sfiligoi&quot;" publish="False" parameter="True"/>
<attr name="GLIDEIN_Rank" value="ImageSize" publish="False" parameter="True"/>
Internal Configuration
The configuration is parsed during the reconfiguration of the Factory, and split into a number of files:
- job.descript => read by the daemon to decide how to work
- attributes.cfg => fixed values, these are published in the Factory ClassAd
- params.cfg => for values the Frontend will change, also published in the Factory ClassAd
Multiple Condor Tarballs
One frequent problem is that one particular condor binary will not run on all compute nodes. Entry points require different architectures, or have different versions of glibc.
The solution is to have multiple condor binaries. The way to do this is to specify a tarball tag in the Factory configuration file.
-
Download the HTCondor binary from the
University of Wisconsin
site. (Alternatively, You can build it from scratch on the
architecture. Refer to HTCondor instructions for this.)
The glideinwms pilot uses a subset of the condor binaries/libraries. The create_condor_tarball script can be used to reduce space needed on your Factory node. Details on this script can be found in the Components - Tools section of the documentation. -
Add a new condor_tarball tag to the Factory configuration
file:
There are two different ways you can do this:-
Put the tarball in a directory owned by the wmsfactory and enter
the condor_tarball tag as:
<glidein ... >
...
<condor_tarballs >
<condor_tarball os="OS" arch="Arch" tar_file="ZIPPED_TARFILE" version="Condor_Version" /> -
Put the tarball in a directory owned by the wmsfactory and
unzip/untar it. Then, enter the condor_tarball tag as:
<glidein ... >
...
<condor_tarballs >
<condor_tarball os="OS" arch="Arch" base_dir="DIR_OF_UNTARRED_BINARY" version="Condor_Version" />
To simplify the configuration, os, arch and version support comma values. This can drastically reduce the number of condor_tarball entries needed in the configuration file.
Consider an example below for default OS as rhel5 and default arch as x86_64. If the Factory admin also wants os and arch information explicitly available, the configuration needs the following entries to cover the possible combinations.
<condor_tarball os="default" arch="default" base_dir="dir" version="default"/>
<condor_tarball os="rhel6" arch="x86_64" base_dir="dir" version="default"/>
<condor_tarball os="rhel6" arch="default" base_dir="dir" version="default"/>
<condor_tarball os="default" arch="x86_64" base_dir="dir" version="default"/>
The above example can be easily consolidated into a single condor_tarball entry as below, and the Factory reconfiguration process will internally consider all the combinations. This also applies to the version.
<condor_tarball os="default,rhel6" arch="default,x86_64" base_dir="dir" version="default"/>
-
Put the tarball in a directory owned by the wmsfactory and enter
the condor_tarball tag as:
-
Verify your entry point attributes. Each
entry point will have the following attr set up.
Make sure that this matches the above
condor_tarball parameters:
<entry>
The CONDOR_OS and the CONDOR_ARCH should match the os and arch defined in the tarball tag. If set to "auto", the glidein will decide the appropriate tarball to use for that worker node. By default, the CONDOR_VERSION will be defined globally in <glidein><attrs> and should match the version in the condor_tarball tag. You can overwrite this global version and define one locally in the entry if needed.
<attrs>
<attr name="CONDOR_ARCH" const="True" parameter="True" glidein_publish="False" job_publish="False" publish="False" type="string" value="Arch"/>
<attr name="CONDOR_OS" const="True" parameter="True" glidein_publish="False" job_publish="False" publish="False" type="string" value="OS"/>
</attrs>
-
Reconfigure the Factory using the command:
./factory_startup reconfig ../CONFIG_DIR/glideinWMS.xml
-
After reconfig, you can see the tar_file created from the
condor distribution in the condor_tarball element in the
configuration as (if you populated just tar_file in step 2):
<condor_tarball arch="default" os="default" tar_file="FACTORY_DIR/condor-8.7tgz" version="default"/>
or (if you populated just base_dir in step 2)<condor_tarball arch="default" base_dir="/opt/glideins/git-xen21-master-ps.ini/condor-wms" os="default" tar_file="/var/www/html/glideinwms/factory_service/stage/glidein_master/condor_bin_0.d7bbk9.tgz" version="default"/>
Limiting time spent on a Grid resource
The whole concept of gliding into Grid resources is based on the idea
that you are getting those resources on a temporary basis. This
implies that you need to leave the slot as soon as possible, else your
jobs will simply be killed by the annoyed Grid administrators.
On the other hand, submitting new glideins is not cost free, so you
want to keep the resource for at least some period of time.
The glideins have two mechanisms to regulate this:
-
After a specified amount of time, the glidein will enter the RETIRING state. This means, it will wait for the current job to finish (or kill it if it does not end within a configurable timeout) and exit immediately afterwards. This obviously implies that no new jobs will start after it entered that state.
The two timeouts can be set with the <attr /> options:
GLIDEIN_Retire_Time=nr_of_seconds
GLIDEIN_Job_Max_Time=nr_of_seconds
The two default to 2 and 100 hours.
-
If a glidein is not claimed within a configurable timeout, the glidein will exit.
The timeouts can be set with:
GLIDEIN_Max_Idle=nr_of_seconds
GLIDEIN_Max_Tail=nr_of_seconds
There are two configurable parameters for this timeout behavior. The first, GLIDEIN_Max_Idle, affects how long a glidein will wait for its first job. The second, GLIDEIN_Max_Tail, is how long a glidein will wait to get a subsequent job once it has finished its job. The defaults for these are 1200 and 400 seconds, respectively.
An example:
[<entries><entry>]
<attrs>
<attr name="GLIDEIN_Max_Idle" value="300" type="int" publish="False" parameter="True"/>
<attr name="GLIDEIN_Retire_Time" value="14400" type="int" publish="False" parameter="True"/>
<attr name="GLIDEIN_Job_Max_Time" value="180000" type="int" publish="False" parameter="True"/>
Old-style pseudo-interactive monitoring
The pseudo-interactive monitoring uses a dedicated startd in the glideins for monitoring purposes. This allows for monitoring even when the job starter enters the “Retiring” activity.
The side effect is that you do not have anymore the cross-VM statistics and the names of the slots is also different.
To enable the old mode, use:
Adding custom files/scripts to the glideins
While the provided code should cover most of the general purpose use cases, some administrators may have additional needs. For these cases, the Glidein framework provides the possibility to download and process additional files. Both Factory and Frontend configurations allow specifying lists of files either for all their glideins, for a specific entry (Factory), or job group (Frontend). These files can be scripts (executables), regular files or tarballs and, depending on the options, can be downloaded at different times (see the custom scripts document for more details).
Note: Files and subsystems will be downloaded before the scripts. User provided scripts will be executed in the specified order, and before the HTCondor daemons are started up.
Here is a list of the attributes of the files. Some examples follow below:
Attribute Name |
Attribute Description |
absfname |
Path of the file on the server (Factory or Frontend). The file name (basename of absfname) can be used also to control the conditional download of the file. If the file name starts with "gconditional_" (e.g. gconditional_FEATURE_myfile), then the name between the prefix and the following underscore is taken and GWMS will look for an attribute (in the attrs list) called GLIDEIN_USE_ followed by the name, and download and process the file only if the attribute is defined and not empty (e.g. GLIDEIN_USE_FEATURE=1). |
executable |
True if the file is a script (executable, see example below), default is False |
wrapper |
True if the file is a user wrapper (see example below), default is False |
untar |
True if the file is a tarball that needs to be expanded (see example below), default is False |
type |
File type, must be one of the valid types (regular, run, source, wrapper, untar). Can have qualifiers depending on the type after a colon (":"). The valid qualifier for run is "singularity", which causes setup scripts to run in Singularity. Currently type is ignored for non-executables and the only values used are "run" or "run:singularity". Default is empty |
const |
If False the file is not constant (i.e. changes may happen without a reconfiguration of the Factory and the file cannot be checksummed), default is True |
relfname |
Path to save the file, relative to the glidein main directory |
period |
If period>0, it is the period (in seconds) at which the executable is run. Default is 0 (non-periodic script). This is ignored for non-executable files. (see the custom scripts document for more) |
prefix |
STARTD_CRON prefix, it is prepended to all HTCondor variables
generated by the script (see documentation). This is ignored
for anything other than a periodic executable script. The
default value is GLIDEIN_PS_ |
comment |
Arbitrary comment string |
after_entry |
If True, the script is executed after the entry scripts. Default is False for the Factory, default is True for the Frontend. (see the custom scripts document for more) |
after_group |
If True, the Frontend script is executed after the group scripts. Default is False. Not considered in the Factory. (see the custom scripts document for more) |
-
<glidein>
[<entries><entry>]
<files>
<file absfname="script name" executable="True" prefix="cron_prefix" comment="comment"/>Path to the custom script. The script will be copied in the Web-accessible area, and when a glidein starts, the glidein startup script will pull it and execute it. If any parameters are needed, they can be specified using <attr />, or stored in a file (see below).
For more detailed information, see the page dedicated to writing custom scripts.<glidein>
[<entries><entry>]
<files>
<file absfname="script name" wrapper="True" comment="comment"/>Path to the wrapper custom script. The script will be copied in the Web-accessible area, and will be sourced just before a user job starts; i.e. it will become part of the user job wrapper.
<glidein>
[<entries><entry>]
<files>
<file absfname="local file name" relfname="target file name" const="Bool" executable="False" comment="comment"/>Path to the config file. The file will be copied in the Web-accessible area, and pulled by the glidein startup script when a glidein starts. It can be then used by any script (see above).
Note: Please be cautious in using the const flag; if set to False, the content of the file will not be verified by the glidein startup script and could be tampered with in transit by a malicious user. So never put sensitive data (like the switch to disable security checks) in a changeable file. -
<glidein>
[<entries><entry>]
<files>
<file absfname="local file name" untar="True" comment="comment">
<untar_options cond_attr="conf_sw" dir="dir name" absdir_outattr="attr name"/>Sometimes it is useful to transfer a whole set of files, or even directories, and that is much easier to accomplish by means of a tarball. A subsystem is the glidein way to describe a compressed tarball that is delivered to the worker nodes, untarred in a separate directory and advertised to the other scripts.
- absfname: Path to the custom tar-ball. (like "/tmp/mytar_v12.5.tgz")
-
conf_sw: Name of a configuration switch. (like
"ENABLE_KRB5")
The tarball will be downloaded and unpacked only if that parameter is set to 1. Use the <attr /> switch to define the default value. The special name TRUE can be used to always untar it. - dir: Name of the subdirectory to untar it in. (like "krb5")
-
absdir_outattr: Name of a variable name. (like
"KRB5_SUBSYS_DIR")
The variable will be set to the absolute path of the directory where the tarball was unpacked, if and only if the unpacking actually happened (otherwise it will not be defined). ENTRY_ will be prepended if the <file> directive occurs in an entry.
Monitoring collectors
By default, the glideins talk to the VO Pool Collector only. This makes monitoring them from the Factory side extremely difficult.
To solve this, you can set up a Monitoring Collector tree that mirrors that of the VO Pool Collector, and tell the glideins to report there, too.
The configuration syntax is very similar to that of
the VO Pool Collector, but using the monitoring_collector keyword instead of
the collector keyword.
For example:
<monitoring_collectors>
<monitoring_collector DN="/DC=org/DC=doegrids/OU=Services/CN=factmoncollector.fnal.gov" node="factmoncollector.fnal.gov" secondary="False" group="default" />
<monitoring_collector DN="/DC=org/DC=doegrids/OU=Services/CN=factmoncollector.fnal.gov" node="factmoncollector.fnal.gov:9620-9819" secondary="True" group="default" />
<monitoring_collector DN="/DC=org/DC=doegrids/OU=Services/CN=factmoncollector2.fnal.gov" node="factmoncollector2.fnal.gov" secondary="False" group="ha" />
<monitoring_collector DN="/DC=org/DC=doegrids/OU=Services/CN=factmoncollector2.fnal.gov" node="factmoncollector2.fnal.gov:9620-9919" secondary="True" group="ha" />
</monitoring_collectors>
For more details, see the Frontend documentation.
XSLT Plugins to extend configuration
You can use XSL transformations (XSLT) to manage complex configuration files. During the reconfig process, the glidein Factory applies XSL transformations available to it in the directory configured by the environment variable GWMS_XSLT_PLUGIN_DIR. (It is also available by supplying the -xslt_plugin_dir option to the reconfig_glidein and reconfig_frontend commands as shown below.)
Setting the variable via sysconfig files is supported: use /etc/sysconfig/gwms-factory for the Factory service and /etc/sysconfig/gwms-frontend for the Frontend service, with contents like the following:
prompt$ cat /etc/sysconfig/gwms-factory
# Configuration file for the Glideinwms services.
#
# Plugin directory to get xslt transformations from.
#
export GWMS_XSLT_PLUGIN_DIR=/etc/gwms-factory/plugin.d
The following sample XSLT adds a custom attribute to the configuration file used by the Factory.
<?xml version="1.0"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" >
  <xsl:template match="glidein/attrs">
    <xsl:copy>
      <attr name="SOME_GWMS_ATTRIBUTE" const="False" glidein_publish="False" job_publish="False" parameter="True" publish="True" type="int" value="7000"/>
      <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
  </xsl:template>
  <xsl:template match="/ | @* | node()">
    <xsl:copy>
      <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
  </xsl:template>
</xsl:stylesheet>
The default behavior can be overridden by specifying the option -xslt_plugin_dir to reconfig_glidein and reconfig_frontend tools.
prompt$ reconfig_glidein -xslt_plugin_dir <xslt directory> [...]
prompt$ reconfig_frontend -xslt_plugin_dir <xslt directory> [...]
Glidein's Startd Advertising to Site HTCondor-CE Collector
You can make the glidein's HTCondor daemons advertise to the site's local collector. There is no Glideinwms configuration that enables this; all the changes are on the site side.
-
Give glidein write access to the site collector
This is done by adding glidein's DN to gridmapfile of the site collector -
Set site collector info in the glidein's environment
Set the CONDORCE_COLLECTOR_HOST in the glidein's environment:
CONDORCE_COLLECTOR_HOST=<site-local HTCondorCE collector address>
Running pre/post reconfigure hooks
You can put executable scripts in the
/etc/gwms-factory/hooks.reconfig.pre/ or the
/etc/gwms-factory/hooks.reconfig.post/ directories. These scripts will be executed every time you reconfigure the factory. The
.pre scripts will be executed before the reconfiguration process begins. The
.post scripts will be executed after the reconfiguration has been done. Scripts will be executed in
/var/lib/gwms-factory/work-dir as user
gfactory. Only executable scripts will be executed.